blob: 3e583d79bb56989676b3de3a20abc2bfad2f4366 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low 5
   bits of each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Type of a bloom bitmask.  Only bits 0..31 are ever addressed, because
   the bit index is taken from the low 5 bits of a character. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL so that the shift is carried out in an
   unsigned type at least 32 bits wide: shifting a plain int 1 left by 31
   is undefined behaviour where int is 32 bits.  The mask argument is
   parenthesized so that an expression can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test with the bloom mask as a cheap first filter; the exact
   Py_UNICODE_ISLINEBREAK() check only runs when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Non-zero if chr is a member of set (setlen characters).  The bloom mask
   is consulted first, so unicode_member() only scans the set when the
   mask bit for chr is set.  The whole expansion is parenthesized so the
   macro composes correctly with operators such as !, || and ?: at the
   call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Martin v. Löwis68192102007-07-21 06:55:02 +0000315 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis68192102007-07-21 06:55:02 +0000329 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
354/* Internal API for use in unicodeobject.c only ! */
355#define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +0000557 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000607 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000717 Py_Type(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Classification of the 128 ASCII characters for the UTF-7 codec; tells
   whether a character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True when character c must be base64-encoded.  encodeO / encodeWS select
   whether table classes 3 (Set O) and 2 (whitespace) count as special. */
#define SPECIAL(c, encodeO, encodeWS)                       \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||     \
     (encodeWS && (utf7_special[(c)] == 2)) ||              \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 character. */
#define B64(n)                                              \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c belongs to the base64 alphabet. */
#define B64CHAR(c)                                          \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* Map a base64 character back to its six-bit value. */
#define UB64(c)                                             \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?       \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to out while at least six bits are pending in ch. */
#define ENCODE(out, ch, bits)                               \
    while (bits >= 6) {                                     \
        *out++ = B64(ch >> (bits-6));                       \
        bits -= 6;                                          \
    }

/* Emit 16-bit units to out while at least 16 bits are pending in ch.
   Relies on `errmsg` and the label `utf7Error` existing at the use site. */
#define DECODE(out, ch, bits, surrogate)                                    \
    while (bits >= 16) {                                                    \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);       \
        bits -= 16;                                                         \
        if (surrogate) {                                                    \
            /* We have already generated an error for the high surrogate   \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                                  \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                    \
            /* This is a surrogate pair. Unfortunately we can't represent  \
               it in a 16-bit character */                                  \
            surrogate = 1;                                                  \
            errmsg = "code pairs are not supported";                        \
            goto utf7Error;                                                 \
        } else {                                                            \
            *out++ = outCh;                                                 \
        }                                                                   \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
Walter Dörwald6e390802007-08-17 16:41:28 +00001507/* --- UTF-32 Codec ------------------------------------------------------- */
1508
1509PyObject *
1510PyUnicode_DecodeUTF32(const char *s,
1511 Py_ssize_t size,
1512 const char *errors,
1513 int *byteorder)
1514{
1515 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF32Stateful(const char *s,
1520 Py_ssize_t size,
1521 const char *errors,
1522 int *byteorder,
1523 Py_ssize_t *consumed)
1524{
1525 const char *starts = s;
1526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
1529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
1531#ifndef Py_UNICODE_WIDE
1532 int i, pairs;
1533#else
1534 const int pairs = 0;
1535#endif
1536 const unsigned char *q, *e;
1537 int bo = 0; /* assume native ordering by default */
1538 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00001539 /* Offsets from q for retrieving bytes in the right order. */
1540#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1541 int iorder[] = {0, 1, 2, 3};
1542#else
1543 int iorder[] = {3, 2, 1, 0};
1544#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00001545 /* On narrow builds we split characters outside the BMP into two
1546 codepoints => count how much extra space we need. */
1547#ifndef Py_UNICODE_WIDE
1548 for (i = pairs = 0; i < size/4; i++)
1549 if (((Py_UCS4 *)s)[i] >= 0x10000)
1550 pairs++;
1551#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00001552 PyObject *errorHandler = NULL;
1553 PyObject *exc = NULL;
1554
1555 /* This might be one to much, because of a BOM */
1556 unicode = _PyUnicode_New((size+3)/4+pairs);
1557 if (!unicode)
1558 return NULL;
1559 if (size == 0)
1560 return (PyObject *)unicode;
1561
1562 /* Unpack UTF-32 encoded data */
1563 p = unicode->str;
1564 q = (unsigned char *)s;
1565 e = q + size;
1566
1567 if (byteorder)
1568 bo = *byteorder;
1569
1570 /* Check for BOM marks (U+FEFF) in the input and adjust current
1571 byte order setting accordingly. In native mode, the leading BOM
1572 mark is skipped, in all other modes, it is copied to the output
1573 stream as-is (giving a ZWNBSP character). */
1574 if (bo == 0) {
1575 if (size >= 4) {
1576 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1577 (q[iorder[1]] << 8) | q[iorder[0]];
1578#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1579 if (bom == 0x0000FEFF) {
1580 q += 4;
1581 bo = -1;
1582 }
1583 else if (bom == 0xFFFE0000) {
1584 q += 4;
1585 bo = 1;
1586 }
1587#else
1588 if (bom == 0x0000FEFF) {
1589 q += 4;
1590 bo = 1;
1591 }
1592 else if (bom == 0xFFFE0000) {
1593 q += 4;
1594 bo = -1;
1595 }
1596#endif
1597 }
1598 }
1599
1600 if (bo == -1) {
1601 /* force LE */
1602 iorder[0] = 0;
1603 iorder[1] = 1;
1604 iorder[2] = 2;
1605 iorder[3] = 3;
1606 }
1607 else if (bo == 1) {
1608 /* force BE */
1609 iorder[0] = 3;
1610 iorder[1] = 2;
1611 iorder[2] = 1;
1612 iorder[3] = 0;
1613 }
1614
1615 while (q < e) {
1616 Py_UCS4 ch;
1617 /* remaining bytes at the end? (size should be divisible by 4) */
1618 if (e-q<4) {
1619 if (consumed)
1620 break;
1621 errmsg = "truncated data";
1622 startinpos = ((const char *)q)-starts;
1623 endinpos = ((const char *)e)-starts;
1624 goto utf32Error;
1625 /* The remaining input chars are ignored if the callback
1626 chooses to skip the input */
1627 }
1628 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1629 (q[iorder[1]] << 8) | q[iorder[0]];
1630
1631 if (ch >= 0x110000)
1632 {
1633 errmsg = "codepoint not in range(0x110000)";
1634 startinpos = ((const char *)q)-starts;
1635 endinpos = startinpos+4;
1636 goto utf32Error;
1637 }
1638#ifndef Py_UNICODE_WIDE
1639 if (ch >= 0x10000)
1640 {
1641 *p++ = 0xD800 | ((ch-0x10000) >> 10);
1642 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
1643 }
1644 else
1645#endif
1646 *p++ = ch;
1647 q += 4;
1648 continue;
1649 utf32Error:
1650 outpos = p-PyUnicode_AS_UNICODE(unicode);
1651 if (unicode_decode_call_errorhandler(
1652 errors, &errorHandler,
1653 "utf32", errmsg,
1654 starts, size, &startinpos, &endinpos, &exc, &s,
1655 (PyObject **)&unicode, &outpos, &p))
1656 goto onError;
1657 }
1658
1659 if (byteorder)
1660 *byteorder = bo;
1661
1662 if (consumed)
1663 *consumed = (const char *)q-starts;
1664
1665 /* Adjust length */
1666 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1667 goto onError;
1668
1669 Py_XDECREF(errorHandler);
1670 Py_XDECREF(exc);
1671 return (PyObject *)unicode;
1672
1673onError:
1674 Py_DECREF(unicode);
1675 Py_XDECREF(errorHandler);
1676 Py_XDECREF(exc);
1677 return NULL;
1678}
1679
1680PyObject *
1681PyUnicode_EncodeUTF32(const Py_UNICODE *s,
1682 Py_ssize_t size,
1683 const char *errors,
1684 int byteorder)
1685{
1686 PyObject *v;
1687 unsigned char *p;
1688#ifndef Py_UNICODE_WIDE
1689 int i, pairs;
1690#else
1691 const int pairs = 0;
1692#endif
1693 /* Offsets from p for storing byte pairs in the right order. */
1694#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1695 int iorder[] = {0, 1, 2, 3};
1696#else
1697 int iorder[] = {3, 2, 1, 0};
1698#endif
1699
1700#define STORECHAR(CH) \
1701 do { \
1702 p[iorder[3]] = ((CH) >> 24) & 0xff; \
1703 p[iorder[2]] = ((CH) >> 16) & 0xff; \
1704 p[iorder[1]] = ((CH) >> 8) & 0xff; \
1705 p[iorder[0]] = (CH) & 0xff; \
1706 p += 4; \
1707 } while(0)
1708
1709 /* In narrow builds we can output surrogate pairs as one codepoint,
1710 so we need less space. */
1711#ifndef Py_UNICODE_WIDE
1712 for (i = pairs = 0; i < size-1; i++)
1713 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
1714 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
1715 pairs++;
1716#endif
1717 v = PyString_FromStringAndSize(NULL,
1718 4 * (size - pairs + (byteorder == 0)));
1719 if (v == NULL)
1720 return NULL;
1721
1722 p = (unsigned char *)PyString_AS_STRING(v);
1723 if (byteorder == 0)
1724 STORECHAR(0xFEFF);
1725 if (size == 0)
1726 return v;
1727
1728 if (byteorder == -1) {
1729 /* force LE */
1730 iorder[0] = 0;
1731 iorder[1] = 1;
1732 iorder[2] = 2;
1733 iorder[3] = 3;
1734 }
1735 else if (byteorder == 1) {
1736 /* force BE */
1737 iorder[0] = 3;
1738 iorder[1] = 2;
1739 iorder[2] = 1;
1740 iorder[3] = 0;
1741 }
1742
1743 while (size-- > 0) {
1744 Py_UCS4 ch = *s++;
1745#ifndef Py_UNICODE_WIDE
1746 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
1747 Py_UCS4 ch2 = *s;
1748 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1749 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1750 s++;
1751 size--;
1752 }
1753 }
1754#endif
1755 STORECHAR(ch);
1756 }
1757 return v;
1758#undef STORECHAR
1759}
1760
1761PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
1762{
1763 if (!PyUnicode_Check(unicode)) {
1764 PyErr_BadArgument();
1765 return NULL;
1766 }
1767 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
1768 PyUnicode_GET_SIZE(unicode),
1769 NULL,
1770 0);
1771}
1772
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773/* --- UTF-16 Codec ------------------------------------------------------- */
1774
Tim Peters772747b2001-08-09 22:21:55 +00001775PyObject *
1776PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001777 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001778 const char *errors,
1779 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780{
Walter Dörwald69652032004-09-07 20:24:22 +00001781 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1782}
1783
1784PyObject *
1785PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001786 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001787 const char *errors,
1788 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001789 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001790{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001791 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001792 Py_ssize_t startinpos;
1793 Py_ssize_t endinpos;
1794 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 PyUnicodeObject *unicode;
1796 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001797 const unsigned char *q, *e;
1798 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001799 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001800 /* Offsets from q for retrieving byte pairs in the right order. */
1801#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1802 int ihi = 1, ilo = 0;
1803#else
1804 int ihi = 0, ilo = 1;
1805#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 PyObject *errorHandler = NULL;
1807 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808
1809 /* Note: size will always be longer than the resulting Unicode
1810 character count */
1811 unicode = _PyUnicode_New(size);
1812 if (!unicode)
1813 return NULL;
1814 if (size == 0)
1815 return (PyObject *)unicode;
1816
1817 /* Unpack UTF-16 encoded data */
1818 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001819 q = (unsigned char *)s;
1820 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821
1822 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001823 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001825 /* Check for BOM marks (U+FEFF) in the input and adjust current
1826 byte order setting accordingly. In native mode, the leading BOM
1827 mark is skipped, in all other modes, it is copied to the output
1828 stream as-is (giving a ZWNBSP character). */
1829 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001830 if (size >= 2) {
1831 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001832#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001833 if (bom == 0xFEFF) {
1834 q += 2;
1835 bo = -1;
1836 }
1837 else if (bom == 0xFFFE) {
1838 q += 2;
1839 bo = 1;
1840 }
Tim Petersced69f82003-09-16 20:30:58 +00001841#else
Walter Dörwald69652032004-09-07 20:24:22 +00001842 if (bom == 0xFEFF) {
1843 q += 2;
1844 bo = 1;
1845 }
1846 else if (bom == 0xFFFE) {
1847 q += 2;
1848 bo = -1;
1849 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001850#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001851 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853
Tim Peters772747b2001-08-09 22:21:55 +00001854 if (bo == -1) {
1855 /* force LE */
1856 ihi = 1;
1857 ilo = 0;
1858 }
1859 else if (bo == 1) {
1860 /* force BE */
1861 ihi = 0;
1862 ilo = 1;
1863 }
1864
1865 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001867 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001869 if (consumed)
1870 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001871 errmsg = "truncated data";
1872 startinpos = ((const char *)q)-starts;
1873 endinpos = ((const char *)e)-starts;
1874 goto utf16Error;
1875 /* The remaining input chars are ignored if the callback
1876 chooses to skip the input */
1877 }
1878 ch = (q[ihi] << 8) | q[ilo];
1879
Tim Peters772747b2001-08-09 22:21:55 +00001880 q += 2;
1881
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882 if (ch < 0xD800 || ch > 0xDFFF) {
1883 *p++ = ch;
1884 continue;
1885 }
1886
1887 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001888 if (q >= e) {
1889 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001890 startinpos = (((const char *)q)-2)-starts;
1891 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001892 goto utf16Error;
1893 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001894 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001895 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1896 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001897 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001898#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001899 *p++ = ch;
1900 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901#else
1902 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001903#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001905 }
1906 else {
1907 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001908 startinpos = (((const char *)q)-4)-starts;
1909 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001910 goto utf16Error;
1911 }
1912
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001914 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 startinpos = (((const char *)q)-2)-starts;
1916 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001917 /* Fall through to report the error */
1918
1919 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001920 outpos = p-PyUnicode_AS_UNICODE(unicode);
1921 if (unicode_decode_call_errorhandler(
1922 errors, &errorHandler,
1923 "utf16", errmsg,
1924 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1925 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001926 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927 }
1928
1929 if (byteorder)
1930 *byteorder = bo;
1931
Walter Dörwald69652032004-09-07 20:24:22 +00001932 if (consumed)
1933 *consumed = (const char *)q-starts;
1934
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001936 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 goto onError;
1938
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001939 Py_XDECREF(errorHandler);
1940 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001941 return (PyObject *)unicode;
1942
1943onError:
1944 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001945 Py_XDECREF(errorHandler);
1946 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 return NULL;
1948}
1949
Tim Peters772747b2001-08-09 22:21:55 +00001950PyObject *
1951PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001952 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001953 const char *errors,
1954 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955{
1956 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001957 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001958#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001959 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001960#else
1961 const int pairs = 0;
1962#endif
Tim Peters772747b2001-08-09 22:21:55 +00001963 /* Offsets from p for storing byte pairs in the right order. */
1964#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1965 int ihi = 1, ilo = 0;
1966#else
1967 int ihi = 0, ilo = 1;
1968#endif
1969
1970#define STORECHAR(CH) \
1971 do { \
1972 p[ihi] = ((CH) >> 8) & 0xff; \
1973 p[ilo] = (CH) & 0xff; \
1974 p += 2; \
1975 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001977#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001978 for (i = pairs = 0; i < size; i++)
1979 if (s[i] >= 0x10000)
1980 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001981#endif
Tim Petersced69f82003-09-16 20:30:58 +00001982 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001983 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 if (v == NULL)
1985 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986
Tim Peters772747b2001-08-09 22:21:55 +00001987 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001989 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001990 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001991 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001992
1993 if (byteorder == -1) {
1994 /* force LE */
1995 ihi = 1;
1996 ilo = 0;
1997 }
1998 else if (byteorder == 1) {
1999 /* force BE */
2000 ihi = 0;
2001 ilo = 1;
2002 }
2003
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002004 while (size-- > 0) {
2005 Py_UNICODE ch = *s++;
2006 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002007#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002008 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002009 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2010 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002012#endif
Tim Peters772747b2001-08-09 22:21:55 +00002013 STORECHAR(ch);
2014 if (ch2)
2015 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002018#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019}
2020
2021PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2022{
2023 if (!PyUnicode_Check(unicode)) {
2024 PyErr_BadArgument();
2025 return NULL;
2026 }
2027 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2028 PyUnicode_GET_SIZE(unicode),
2029 NULL,
2030 0);
2031}
2032
2033/* --- Unicode Escape Codec ----------------------------------------------- */
2034
Fredrik Lundh06d12682001-01-24 07:59:11 +00002035static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002036
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002038 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 const char *errors)
2040{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002042 Py_ssize_t startinpos;
2043 Py_ssize_t endinpos;
2044 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002049 char* message;
2050 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002051 PyObject *errorHandler = NULL;
2052 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002053
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 /* Escaped strings will always be longer than the resulting
2055 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 length after conversion to the true value.
2057 (but if the error callback returns a long replacement string
2058 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 v = _PyUnicode_New(size);
2060 if (v == NULL)
2061 goto onError;
2062 if (size == 0)
2063 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002064
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002065 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002067
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 while (s < end) {
2069 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002070 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072
2073 /* Non-escape characters are interpreted as Unicode ordinals */
2074 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002075 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 continue;
2077 }
2078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 /* \ - Escapes */
2081 s++;
2082 switch (*s++) {
2083
2084 /* \x escapes */
2085 case '\n': break;
2086 case '\\': *p++ = '\\'; break;
2087 case '\'': *p++ = '\''; break;
2088 case '\"': *p++ = '\"'; break;
2089 case 'b': *p++ = '\b'; break;
2090 case 'f': *p++ = '\014'; break; /* FF */
2091 case 't': *p++ = '\t'; break;
2092 case 'n': *p++ = '\n'; break;
2093 case 'r': *p++ = '\r'; break;
2094 case 'v': *p++ = '\013'; break; /* VT */
2095 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2096
2097 /* \OOO (octal) escapes */
2098 case '0': case '1': case '2': case '3':
2099 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002100 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002102 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002104 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002106 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 break;
2108
Fredrik Lundhccc74732001-02-18 22:13:49 +00002109 /* hex escapes */
2110 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002112 digits = 2;
2113 message = "truncated \\xXX escape";
2114 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115
Fredrik Lundhccc74732001-02-18 22:13:49 +00002116 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002118 digits = 4;
2119 message = "truncated \\uXXXX escape";
2120 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121
Fredrik Lundhccc74732001-02-18 22:13:49 +00002122 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002123 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002124 digits = 8;
2125 message = "truncated \\UXXXXXXXX escape";
2126 hexescape:
2127 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128 outpos = p-PyUnicode_AS_UNICODE(v);
2129 if (s+digits>end) {
2130 endinpos = size;
2131 if (unicode_decode_call_errorhandler(
2132 errors, &errorHandler,
2133 "unicodeescape", "end of string in escape sequence",
2134 starts, size, &startinpos, &endinpos, &exc, &s,
2135 (PyObject **)&v, &outpos, &p))
2136 goto onError;
2137 goto nextByte;
2138 }
2139 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002140 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002141 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 endinpos = (s+i+1)-starts;
2143 if (unicode_decode_call_errorhandler(
2144 errors, &errorHandler,
2145 "unicodeescape", message,
2146 starts, size, &startinpos, &endinpos, &exc, &s,
2147 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002148 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002149 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002150 }
2151 chr = (chr<<4) & ~0xF;
2152 if (c >= '0' && c <= '9')
2153 chr += c - '0';
2154 else if (c >= 'a' && c <= 'f')
2155 chr += 10 + c - 'a';
2156 else
2157 chr += 10 + c - 'A';
2158 }
2159 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002160 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002161 /* _decoding_error will have already written into the
2162 target buffer. */
2163 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002164 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002165 /* when we get here, chr is a 32-bit unicode character */
2166 if (chr <= 0xffff)
2167 /* UCS-2 character */
2168 *p++ = (Py_UNICODE) chr;
2169 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002170 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002171 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002172#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002173 *p++ = chr;
2174#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002175 chr -= 0x10000L;
2176 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002177 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002178#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002179 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002180 endinpos = s-starts;
2181 outpos = p-PyUnicode_AS_UNICODE(v);
2182 if (unicode_decode_call_errorhandler(
2183 errors, &errorHandler,
2184 "unicodeescape", "illegal Unicode character",
2185 starts, size, &startinpos, &endinpos, &exc, &s,
2186 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002187 goto onError;
2188 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002189 break;
2190
2191 /* \N{name} */
2192 case 'N':
2193 message = "malformed \\N character escape";
2194 if (ucnhash_CAPI == NULL) {
2195 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002196 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002197 m = PyImport_ImportModule("unicodedata");
2198 if (m == NULL)
2199 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002200 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002201 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002202 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002203 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002204 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002205 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002206 if (ucnhash_CAPI == NULL)
2207 goto ucnhashError;
2208 }
2209 if (*s == '{') {
2210 const char *start = s+1;
2211 /* look for the closing brace */
2212 while (*s != '}' && s < end)
2213 s++;
2214 if (s > start && s < end && *s == '}') {
2215 /* found a name. look it up in the unicode database */
2216 message = "unknown Unicode character name";
2217 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002218 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002219 goto store;
2220 }
2221 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002222 endinpos = s-starts;
2223 outpos = p-PyUnicode_AS_UNICODE(v);
2224 if (unicode_decode_call_errorhandler(
2225 errors, &errorHandler,
2226 "unicodeescape", message,
2227 starts, size, &startinpos, &endinpos, &exc, &s,
2228 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002229 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002230 break;
2231
2232 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002233 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002234 message = "\\ at end of string";
2235 s--;
2236 endinpos = s-starts;
2237 outpos = p-PyUnicode_AS_UNICODE(v);
2238 if (unicode_decode_call_errorhandler(
2239 errors, &errorHandler,
2240 "unicodeescape", message,
2241 starts, size, &startinpos, &endinpos, &exc, &s,
2242 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002243 goto onError;
2244 }
2245 else {
2246 *p++ = '\\';
2247 *p++ = (unsigned char)s[-1];
2248 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002249 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251 nextByte:
2252 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002253 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002254 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002256 Py_XDECREF(errorHandler);
2257 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002259
Fredrik Lundhccc74732001-02-18 22:13:49 +00002260ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002261 PyErr_SetString(
2262 PyExc_UnicodeError,
2263 "\\N escapes not supported (can't load unicodedata module)"
2264 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002265 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 Py_XDECREF(errorHandler);
2267 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002268 return NULL;
2269
Fredrik Lundhccc74732001-02-18 22:13:49 +00002270onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_XDECREF(errorHandler);
2273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return NULL;
2275}
2276
2277/* Return a Unicode-Escape string version of the Unicode object.
2278
2279 If quotes is true, the string is enclosed in u"" or u'' quotes as
2280 appropriate.
2281
2282*/
2283
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002284Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002285 Py_ssize_t size,
2286 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002287{
2288 /* like wcschr, but doesn't stop at NULL characters */
2289
2290 while (size-- > 0) {
2291 if (*s == ch)
2292 return s;
2293 s++;
2294 }
2295
2296 return NULL;
2297}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002298
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299static
2300PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002301 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302 int quotes)
2303{
2304 PyObject *repr;
2305 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002307 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308
Neal Norwitz17753ec2006-08-21 22:21:19 +00002309 /* XXX(nnorwitz): rather than over-allocating, it would be
2310 better to choose a different scheme. Perhaps scan the
2311 first N-chars of the string and allocate based on that size.
2312 */
2313 /* Initial allocation is based on the longest-possible unichr
2314 escape.
2315
2316 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2317 unichr, so in this case it's the longest unichr escape. In
2318 narrow (UTF-16) builds this is five chars per source unichr
2319 since there are two unichrs in the surrogate pair, so in narrow
2320 (UTF-16) builds it's not the longest unichr escape.
2321
2322 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2323 so in the narrow (UTF-16) build case it's the longest unichr
2324 escape.
2325 */
2326
2327 repr = PyString_FromStringAndSize(NULL,
2328 2
2329#ifdef Py_UNICODE_WIDE
2330 + 10*size
2331#else
2332 + 6*size
2333#endif
2334 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335 if (repr == NULL)
2336 return NULL;
2337
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002338 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002339
2340 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002342 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343 !findchar(s, size, '"')) ? '"' : '\'';
2344 }
2345 while (size-- > 0) {
2346 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002347
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002348 /* Escape quotes and backslashes */
2349 if ((quotes &&
2350 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 *p++ = '\\';
2352 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002353 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002354 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002355
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002356#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002357 /* Map 21-bit characters to '\U00xxxxxx' */
2358 else if (ch >= 0x10000) {
2359 *p++ = '\\';
2360 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002361 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2362 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2363 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2364 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2365 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2366 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2367 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002368 *p++ = hexdigit[ch & 0x0000000F];
2369 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002370 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002371#else
2372 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002373 else if (ch >= 0xD800 && ch < 0xDC00) {
2374 Py_UNICODE ch2;
2375 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002376
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002377 ch2 = *s++;
2378 size--;
2379 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2380 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2381 *p++ = '\\';
2382 *p++ = 'U';
2383 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2384 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2385 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2386 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2387 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2388 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2389 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2390 *p++ = hexdigit[ucs & 0x0000000F];
2391 continue;
2392 }
2393 /* Fall through: isolated surrogates are copied as-is */
2394 s--;
2395 size++;
2396 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002397#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002398
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002400 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401 *p++ = '\\';
2402 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002403 *p++ = hexdigit[(ch >> 12) & 0x000F];
2404 *p++ = hexdigit[(ch >> 8) & 0x000F];
2405 *p++ = hexdigit[(ch >> 4) & 0x000F];
2406 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002407 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002408
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002409 /* Map special whitespace to '\t', \n', '\r' */
2410 else if (ch == '\t') {
2411 *p++ = '\\';
2412 *p++ = 't';
2413 }
2414 else if (ch == '\n') {
2415 *p++ = '\\';
2416 *p++ = 'n';
2417 }
2418 else if (ch == '\r') {
2419 *p++ = '\\';
2420 *p++ = 'r';
2421 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002422
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002423 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002424 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002426 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002427 *p++ = hexdigit[(ch >> 4) & 0x000F];
2428 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002429 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002430
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431 /* Copy everything else as-is */
2432 else
2433 *p++ = (char) ch;
2434 }
2435 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002436 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437
2438 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002439 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002440 return repr;
2441}
2442
2443PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002444 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445{
2446 return unicodeescape_string(s, size, 0);
2447}
2448
2449PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2450{
2451 if (!PyUnicode_Check(unicode)) {
2452 PyErr_BadArgument();
2453 return NULL;
2454 }
2455 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2456 PyUnicode_GET_SIZE(unicode));
2457}
2458
2459/* --- Raw Unicode Escape Codec ------------------------------------------- */
2460
2461PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002462 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 const char *errors)
2464{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002465 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002466 Py_ssize_t startinpos;
2467 Py_ssize_t endinpos;
2468 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 const char *end;
2472 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002473 PyObject *errorHandler = NULL;
2474 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002475
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 /* Escaped strings will always be longer than the resulting
2477 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 length after conversion to the true value. (But decoding error
2479 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 v = _PyUnicode_New(size);
2481 if (v == NULL)
2482 goto onError;
2483 if (size == 0)
2484 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 end = s + size;
2487 while (s < end) {
2488 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002489 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002491 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492
2493 /* Non-escape characters are interpreted as Unicode ordinals */
2494 if (*s != '\\') {
2495 *p++ = (unsigned char)*s++;
2496 continue;
2497 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499
2500 /* \u-escapes are only interpreted iff the number of leading
2501 backslashes if odd */
2502 bs = s;
2503 for (;s < end;) {
2504 if (*s != '\\')
2505 break;
2506 *p++ = (unsigned char)*s++;
2507 }
2508 if (((s - bs) & 1) == 0 ||
2509 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002510 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 continue;
2512 }
2513 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002514 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 s++;
2516
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002517 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002518 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002519 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002520 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002522 endinpos = s-starts;
2523 if (unicode_decode_call_errorhandler(
2524 errors, &errorHandler,
2525 "rawunicodeescape", "truncated \\uXXXX",
2526 starts, size, &startinpos, &endinpos, &exc, &s,
2527 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002529 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 }
2531 x = (x<<4) & ~0xF;
2532 if (c >= '0' && c <= '9')
2533 x += c - '0';
2534 else if (c >= 'a' && c <= 'f')
2535 x += 10 + c - 'a';
2536 else
2537 x += 10 + c - 'A';
2538 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002539#ifndef Py_UNICODE_WIDE
2540 if (x > 0x10000) {
2541 if (unicode_decode_call_errorhandler(
2542 errors, &errorHandler,
2543 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2544 starts, size, &startinpos, &endinpos, &exc, &s,
2545 (PyObject **)&v, &outpos, &p))
2546 goto onError;
2547 }
2548#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 *p++ = x;
2550 nextByte:
2551 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002553 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002554 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 Py_XDECREF(errorHandler);
2556 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 onError:
2560 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 Py_XDECREF(errorHandler);
2562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 return NULL;
2564}
2565
2566PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002567 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568{
2569 PyObject *repr;
2570 char *p;
2571 char *q;
2572
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002573 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002575#ifdef Py_UNICODE_WIDE
2576 repr = PyString_FromStringAndSize(NULL, 10 * size);
2577#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002579#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 if (repr == NULL)
2581 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002582 if (size == 0)
2583 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584
2585 p = q = PyString_AS_STRING(repr);
2586 while (size-- > 0) {
2587 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002588#ifdef Py_UNICODE_WIDE
2589 /* Map 32-bit characters to '\Uxxxxxxxx' */
2590 if (ch >= 0x10000) {
2591 *p++ = '\\';
2592 *p++ = 'U';
2593 *p++ = hexdigit[(ch >> 28) & 0xf];
2594 *p++ = hexdigit[(ch >> 24) & 0xf];
2595 *p++ = hexdigit[(ch >> 20) & 0xf];
2596 *p++ = hexdigit[(ch >> 16) & 0xf];
2597 *p++ = hexdigit[(ch >> 12) & 0xf];
2598 *p++ = hexdigit[(ch >> 8) & 0xf];
2599 *p++ = hexdigit[(ch >> 4) & 0xf];
2600 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002601 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002602 else
2603#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 /* Map 16-bit characters to '\uxxxx' */
2605 if (ch >= 256) {
2606 *p++ = '\\';
2607 *p++ = 'u';
2608 *p++ = hexdigit[(ch >> 12) & 0xf];
2609 *p++ = hexdigit[(ch >> 8) & 0xf];
2610 *p++ = hexdigit[(ch >> 4) & 0xf];
2611 *p++ = hexdigit[ch & 15];
2612 }
2613 /* Copy everything else as-is */
2614 else
2615 *p++ = (char) ch;
2616 }
2617 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002618 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 return repr;
2620}
2621
2622PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2623{
2624 if (!PyUnicode_Check(unicode)) {
2625 PyErr_BadArgument();
2626 return NULL;
2627 }
2628 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2629 PyUnicode_GET_SIZE(unicode));
2630}
2631
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002632/* --- Unicode Internal Codec ------------------------------------------- */
2633
2634PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002635 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002636 const char *errors)
2637{
2638 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002639 Py_ssize_t startinpos;
2640 Py_ssize_t endinpos;
2641 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002642 PyUnicodeObject *v;
2643 Py_UNICODE *p;
2644 const char *end;
2645 const char *reason;
2646 PyObject *errorHandler = NULL;
2647 PyObject *exc = NULL;
2648
Neal Norwitzd43069c2006-01-08 01:12:10 +00002649#ifdef Py_UNICODE_WIDE
2650 Py_UNICODE unimax = PyUnicode_GetMax();
2651#endif
2652
Armin Rigo7ccbca92006-10-04 12:17:45 +00002653 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002654 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2655 if (v == NULL)
2656 goto onError;
2657 if (PyUnicode_GetSize((PyObject *)v) == 0)
2658 return (PyObject *)v;
2659 p = PyUnicode_AS_UNICODE(v);
2660 end = s + size;
2661
2662 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002663 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002664 /* We have to sanity check the raw data, otherwise doom looms for
2665 some malformed UCS-4 data. */
2666 if (
2667 #ifdef Py_UNICODE_WIDE
2668 *p > unimax || *p < 0 ||
2669 #endif
2670 end-s < Py_UNICODE_SIZE
2671 )
2672 {
2673 startinpos = s - starts;
2674 if (end-s < Py_UNICODE_SIZE) {
2675 endinpos = end-starts;
2676 reason = "truncated input";
2677 }
2678 else {
2679 endinpos = s - starts + Py_UNICODE_SIZE;
2680 reason = "illegal code point (> 0x10FFFF)";
2681 }
2682 outpos = p - PyUnicode_AS_UNICODE(v);
2683 if (unicode_decode_call_errorhandler(
2684 errors, &errorHandler,
2685 "unicode_internal", reason,
2686 starts, size, &startinpos, &endinpos, &exc, &s,
2687 (PyObject **)&v, &outpos, &p)) {
2688 goto onError;
2689 }
2690 }
2691 else {
2692 p++;
2693 s += Py_UNICODE_SIZE;
2694 }
2695 }
2696
Martin v. Löwis412fb672006-04-13 06:34:32 +00002697 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002698 goto onError;
2699 Py_XDECREF(errorHandler);
2700 Py_XDECREF(exc);
2701 return (PyObject *)v;
2702
2703 onError:
2704 Py_XDECREF(v);
2705 Py_XDECREF(errorHandler);
2706 Py_XDECREF(exc);
2707 return NULL;
2708}
2709
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710/* --- Latin-1 Codec ------------------------------------------------------ */
2711
2712PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002713 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 const char *errors)
2715{
2716 PyUnicodeObject *v;
2717 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002718
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002720 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002721 Py_UNICODE r = *(unsigned char*)s;
2722 return PyUnicode_FromUnicode(&r, 1);
2723 }
2724
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 v = _PyUnicode_New(size);
2726 if (v == NULL)
2727 goto onError;
2728 if (size == 0)
2729 return (PyObject *)v;
2730 p = PyUnicode_AS_UNICODE(v);
2731 while (size-- > 0)
2732 *p++ = (unsigned char)*s++;
2733 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002734
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 onError:
2736 Py_XDECREF(v);
2737 return NULL;
2738}
2739
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740/* create or adjust a UnicodeEncodeError */
2741static void make_encode_exception(PyObject **exceptionObject,
2742 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002743 const Py_UNICODE *unicode, Py_ssize_t size,
2744 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002745 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 if (*exceptionObject == NULL) {
2748 *exceptionObject = PyUnicodeEncodeError_Create(
2749 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
2751 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2753 goto onError;
2754 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2755 goto onError;
2756 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2757 goto onError;
2758 return;
2759 onError:
2760 Py_DECREF(*exceptionObject);
2761 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
2763}
2764
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765/* raises a UnicodeEncodeError */
2766static void raise_encode_exception(PyObject **exceptionObject,
2767 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002768 const Py_UNICODE *unicode, Py_ssize_t size,
2769 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002770 const char *reason)
2771{
2772 make_encode_exception(exceptionObject,
2773 encoding, unicode, size, startpos, endpos, reason);
2774 if (*exceptionObject != NULL)
2775 PyCodec_StrictErrors(*exceptionObject);
2776}
2777
2778/* error handling callback helper:
2779 build arguments, call the callback and check the arguments,
2780 put the result into newpos and return the replacement string, which
2781 has to be freed by the caller */
2782static PyObject *unicode_encode_call_errorhandler(const char *errors,
2783 PyObject **errorHandler,
2784 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002785 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2786 Py_ssize_t startpos, Py_ssize_t endpos,
2787 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002789 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790
2791 PyObject *restuple;
2792 PyObject *resunicode;
2793
2794 if (*errorHandler == NULL) {
2795 *errorHandler = PyCodec_LookupError(errors);
2796 if (*errorHandler == NULL)
2797 return NULL;
2798 }
2799
2800 make_encode_exception(exceptionObject,
2801 encoding, unicode, size, startpos, endpos, reason);
2802 if (*exceptionObject == NULL)
2803 return NULL;
2804
2805 restuple = PyObject_CallFunctionObjArgs(
2806 *errorHandler, *exceptionObject, NULL);
2807 if (restuple == NULL)
2808 return NULL;
2809 if (!PyTuple_Check(restuple)) {
2810 PyErr_Format(PyExc_TypeError, &argparse[4]);
2811 Py_DECREF(restuple);
2812 return NULL;
2813 }
2814 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2815 &resunicode, newpos)) {
2816 Py_DECREF(restuple);
2817 return NULL;
2818 }
2819 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002820 *newpos = size+*newpos;
2821 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002822 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002823 Py_DECREF(restuple);
2824 return NULL;
2825 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 Py_INCREF(resunicode);
2827 Py_DECREF(restuple);
2828 return resunicode;
2829}
2830
2831static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002832 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833 const char *errors,
2834 int limit)
2835{
2836 /* output object */
2837 PyObject *res;
2838 /* pointers to the beginning and end+1 of input */
2839 const Py_UNICODE *startp = p;
2840 const Py_UNICODE *endp = p + size;
2841 /* pointer to the beginning of the unencodable characters */
2842 /* const Py_UNICODE *badp = NULL; */
2843 /* pointer into the output */
2844 char *str;
2845 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002846 Py_ssize_t respos = 0;
2847 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002848 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2849 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 PyObject *errorHandler = NULL;
2851 PyObject *exc = NULL;
2852 /* the following variable is used for caching string comparisons
2853 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2854 int known_errorHandler = -1;
2855
2856 /* allocate enough for a simple encoding without
2857 replacements, if we need more, we'll resize */
2858 res = PyString_FromStringAndSize(NULL, size);
2859 if (res == NULL)
2860 goto onError;
2861 if (size == 0)
2862 return res;
2863 str = PyString_AS_STRING(res);
2864 ressize = size;
2865
2866 while (p<endp) {
2867 Py_UNICODE c = *p;
2868
2869 /* can we encode this? */
2870 if (c<limit) {
2871 /* no overflow check, because we know that the space is enough */
2872 *str++ = (char)c;
2873 ++p;
2874 }
2875 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002876 Py_ssize_t unicodepos = p-startp;
2877 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002878 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002879 Py_ssize_t repsize;
2880 Py_ssize_t newpos;
2881 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002882 Py_UNICODE *uni2;
2883 /* startpos for collecting unencodable chars */
2884 const Py_UNICODE *collstart = p;
2885 const Py_UNICODE *collend = p;
2886 /* find all unecodable characters */
2887 while ((collend < endp) && ((*collend)>=limit))
2888 ++collend;
2889 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2890 if (known_errorHandler==-1) {
2891 if ((errors==NULL) || (!strcmp(errors, "strict")))
2892 known_errorHandler = 1;
2893 else if (!strcmp(errors, "replace"))
2894 known_errorHandler = 2;
2895 else if (!strcmp(errors, "ignore"))
2896 known_errorHandler = 3;
2897 else if (!strcmp(errors, "xmlcharrefreplace"))
2898 known_errorHandler = 4;
2899 else
2900 known_errorHandler = 0;
2901 }
2902 switch (known_errorHandler) {
2903 case 1: /* strict */
2904 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2905 goto onError;
2906 case 2: /* replace */
2907 while (collstart++<collend)
2908 *str++ = '?'; /* fall through */
2909 case 3: /* ignore */
2910 p = collend;
2911 break;
2912 case 4: /* xmlcharrefreplace */
2913 respos = str-PyString_AS_STRING(res);
2914 /* determine replacement size (temporarily (mis)uses p) */
2915 for (p = collstart, repsize = 0; p < collend; ++p) {
2916 if (*p<10)
2917 repsize += 2+1+1;
2918 else if (*p<100)
2919 repsize += 2+2+1;
2920 else if (*p<1000)
2921 repsize += 2+3+1;
2922 else if (*p<10000)
2923 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002924#ifndef Py_UNICODE_WIDE
2925 else
2926 repsize += 2+5+1;
2927#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002928 else if (*p<100000)
2929 repsize += 2+5+1;
2930 else if (*p<1000000)
2931 repsize += 2+6+1;
2932 else
2933 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002934#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002935 }
2936 requiredsize = respos+repsize+(endp-collend);
2937 if (requiredsize > ressize) {
2938 if (requiredsize<2*ressize)
2939 requiredsize = 2*ressize;
2940 if (_PyString_Resize(&res, requiredsize))
2941 goto onError;
2942 str = PyString_AS_STRING(res) + respos;
2943 ressize = requiredsize;
2944 }
2945 /* generate replacement (temporarily (mis)uses p) */
2946 for (p = collstart; p < collend; ++p) {
2947 str += sprintf(str, "&#%d;", (int)*p);
2948 }
2949 p = collend;
2950 break;
2951 default:
2952 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2953 encoding, reason, startp, size, &exc,
2954 collstart-startp, collend-startp, &newpos);
2955 if (repunicode == NULL)
2956 goto onError;
2957 /* need more space? (at least enough for what we
2958 have+the replacement+the rest of the string, so
2959 we won't have to check space for encodable characters) */
2960 respos = str-PyString_AS_STRING(res);
2961 repsize = PyUnicode_GET_SIZE(repunicode);
2962 requiredsize = respos+repsize+(endp-collend);
2963 if (requiredsize > ressize) {
2964 if (requiredsize<2*ressize)
2965 requiredsize = 2*ressize;
2966 if (_PyString_Resize(&res, requiredsize)) {
2967 Py_DECREF(repunicode);
2968 goto onError;
2969 }
2970 str = PyString_AS_STRING(res) + respos;
2971 ressize = requiredsize;
2972 }
2973 /* check if there is anything unencodable in the replacement
2974 and copy it to the output */
2975 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2976 c = *uni2;
2977 if (c >= limit) {
2978 raise_encode_exception(&exc, encoding, startp, size,
2979 unicodepos, unicodepos+1, reason);
2980 Py_DECREF(repunicode);
2981 goto onError;
2982 }
2983 *str = (char)c;
2984 }
2985 p = startp + newpos;
2986 Py_DECREF(repunicode);
2987 }
2988 }
2989 }
2990 /* Resize if we allocated to much */
2991 respos = str-PyString_AS_STRING(res);
2992 if (respos<ressize)
2993 /* If this falls res will be NULL */
2994 _PyString_Resize(&res, respos);
2995 Py_XDECREF(errorHandler);
2996 Py_XDECREF(exc);
2997 return res;
2998
2999 onError:
3000 Py_XDECREF(res);
3001 Py_XDECREF(errorHandler);
3002 Py_XDECREF(exc);
3003 return NULL;
3004}
3005
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003007 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 const char *errors)
3009{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003010 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011}
3012
3013PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3014{
3015 if (!PyUnicode_Check(unicode)) {
3016 PyErr_BadArgument();
3017 return NULL;
3018 }
3019 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3020 PyUnicode_GET_SIZE(unicode),
3021 NULL);
3022}
3023
3024/* --- 7-bit ASCII Codec -------------------------------------------------- */
3025
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003027 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 const char *errors)
3029{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003030 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 PyUnicodeObject *v;
3032 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003033 Py_ssize_t startinpos;
3034 Py_ssize_t endinpos;
3035 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003036 const char *e;
3037 PyObject *errorHandler = NULL;
3038 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003039
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003041 if (size == 1 && *(unsigned char*)s < 128) {
3042 Py_UNICODE r = *(unsigned char*)s;
3043 return PyUnicode_FromUnicode(&r, 1);
3044 }
Tim Petersced69f82003-09-16 20:30:58 +00003045
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 v = _PyUnicode_New(size);
3047 if (v == NULL)
3048 goto onError;
3049 if (size == 0)
3050 return (PyObject *)v;
3051 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052 e = s + size;
3053 while (s < e) {
3054 register unsigned char c = (unsigned char)*s;
3055 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057 ++s;
3058 }
3059 else {
3060 startinpos = s-starts;
3061 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003062 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063 if (unicode_decode_call_errorhandler(
3064 errors, &errorHandler,
3065 "ascii", "ordinal not in range(128)",
3066 starts, size, &startinpos, &endinpos, &exc, &s,
3067 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003071 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003072 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003073 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003074 Py_XDECREF(errorHandler);
3075 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003077
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 onError:
3079 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003080 Py_XDECREF(errorHandler);
3081 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 return NULL;
3083}
3084
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003086 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 const char *errors)
3088{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003089 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090}
3091
3092PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3093{
3094 if (!PyUnicode_Check(unicode)) {
3095 PyErr_BadArgument();
3096 return NULL;
3097 }
3098 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3099 PyUnicode_GET_SIZE(unicode),
3100 NULL);
3101}
3102
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003103#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003104
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003105/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003106
Martin v. Löwisd8251432006-06-14 05:21:04 +00003107#if SIZEOF_INT < SIZEOF_SSIZE_T
3108#define NEED_RETRY
3109#endif
3110
3111/* XXX This code is limited to "true" double-byte encodings, as
3112 a) it assumes an incomplete character consists of a single byte, and
3113 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3114 encodings, see IsDBCSLeadByteEx documentation. */
3115
/* Return non-zero if the byte at s[offset] is a DBCS lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when one of these holds
   for the previous character position reported by CharPrev():
     - there is no previous character (CharPrev returned the same pointer),
     - the previous character does not start with a lead byte, or
     - the previous character is exactly two bytes away. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
3126
3127/*
3128 * Decode MBCS string into unicode object. If 'final' is set, converts
3129 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3130 */
3131static int decode_mbcs(PyUnicodeObject **v,
3132 const char *s, /* MBCS string */
3133 int size, /* sizeof MBCS string */
3134 int final)
3135{
3136 Py_UNICODE *p;
3137 Py_ssize_t n = 0;
3138 int usize = 0;
3139
3140 assert(size >= 0);
3141
3142 /* Skip trailing lead-byte unless 'final' is set */
3143 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3144 --size;
3145
3146 /* First get the size of the result */
3147 if (size > 0) {
3148 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3149 if (usize == 0) {
3150 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3151 return -1;
3152 }
3153 }
3154
3155 if (*v == NULL) {
3156 /* Create unicode object */
3157 *v = _PyUnicode_New(usize);
3158 if (*v == NULL)
3159 return -1;
3160 }
3161 else {
3162 /* Extend unicode object */
3163 n = PyUnicode_GET_SIZE(*v);
3164 if (_PyUnicode_Resize(v, n + usize) < 0)
3165 return -1;
3166 }
3167
3168 /* Do the conversion */
3169 if (size > 0) {
3170 p = PyUnicode_AS_UNICODE(*v) + n;
3171 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3172 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3173 return -1;
3174 }
3175 }
3176
3177 return size;
3178}
3179
3180PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3181 Py_ssize_t size,
3182 const char *errors,
3183 Py_ssize_t *consumed)
3184{
3185 PyUnicodeObject *v = NULL;
3186 int done;
3187
3188 if (consumed)
3189 *consumed = 0;
3190
3191#ifdef NEED_RETRY
3192 retry:
3193 if (size > INT_MAX)
3194 done = decode_mbcs(&v, s, INT_MAX, 0);
3195 else
3196#endif
3197 done = decode_mbcs(&v, s, (int)size, !consumed);
3198
3199 if (done < 0) {
3200 Py_XDECREF(v);
3201 return NULL;
3202 }
3203
3204 if (consumed)
3205 *consumed += done;
3206
3207#ifdef NEED_RETRY
3208 if (size > INT_MAX) {
3209 s += done;
3210 size -= done;
3211 goto retry;
3212 }
3213#endif
3214
3215 return (PyObject *)v;
3216}
3217
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003218PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003219 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003220 const char *errors)
3221{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003222 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3223}
3224
3225/*
3226 * Convert unicode into string object (MBCS).
3227 * Returns 0 if succeed, -1 otherwise.
3228 */
3229static int encode_mbcs(PyObject **repr,
3230 const Py_UNICODE *p, /* unicode */
3231 int size) /* size of unicode */
3232{
3233 int mbcssize = 0;
3234 Py_ssize_t n = 0;
3235
3236 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003237
3238 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003239 if (size > 0) {
3240 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3241 if (mbcssize == 0) {
3242 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3243 return -1;
3244 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003245 }
3246
Martin v. Löwisd8251432006-06-14 05:21:04 +00003247 if (*repr == NULL) {
3248 /* Create string object */
3249 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3250 if (*repr == NULL)
3251 return -1;
3252 }
3253 else {
3254 /* Extend string object */
3255 n = PyString_Size(*repr);
3256 if (_PyString_Resize(repr, n + mbcssize) < 0)
3257 return -1;
3258 }
3259
3260 /* Do the conversion */
3261 if (size > 0) {
3262 char *s = PyString_AS_STRING(*repr) + n;
3263 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3264 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3265 return -1;
3266 }
3267 }
3268
3269 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003270}
3271
3272PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003273 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003274 const char *errors)
3275{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003276 PyObject *repr = NULL;
3277 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003278
Martin v. Löwisd8251432006-06-14 05:21:04 +00003279#ifdef NEED_RETRY
3280 retry:
3281 if (size > INT_MAX)
3282 ret = encode_mbcs(&repr, p, INT_MAX);
3283 else
3284#endif
3285 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003286
Martin v. Löwisd8251432006-06-14 05:21:04 +00003287 if (ret < 0) {
3288 Py_XDECREF(repr);
3289 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003290 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003291
3292#ifdef NEED_RETRY
3293 if (size > INT_MAX) {
3294 p += INT_MAX;
3295 size -= INT_MAX;
3296 goto retry;
3297 }
3298#endif
3299
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003300 return repr;
3301}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003302
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003303PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3304{
3305 if (!PyUnicode_Check(unicode)) {
3306 PyErr_BadArgument();
3307 return NULL;
3308 }
3309 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3310 PyUnicode_GET_SIZE(unicode),
3311 NULL);
3312}
3313
Martin v. Löwisd8251432006-06-14 05:21:04 +00003314#undef NEED_RETRY
3315
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003316#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003317
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318/* --- Character Mapping Codec -------------------------------------------- */
3319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003321 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322 PyObject *mapping,
3323 const char *errors)
3324{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003325 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003326 Py_ssize_t startinpos;
3327 Py_ssize_t endinpos;
3328 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003329 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 PyUnicodeObject *v;
3331 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003332 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003333 PyObject *errorHandler = NULL;
3334 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003335 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003336 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003337
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 /* Default to Latin-1 */
3339 if (mapping == NULL)
3340 return PyUnicode_DecodeLatin1(s, size, errors);
3341
3342 v = _PyUnicode_New(size);
3343 if (v == NULL)
3344 goto onError;
3345 if (size == 0)
3346 return (PyObject *)v;
3347 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003349 if (PyUnicode_CheckExact(mapping)) {
3350 mapstring = PyUnicode_AS_UNICODE(mapping);
3351 maplen = PyUnicode_GET_SIZE(mapping);
3352 while (s < e) {
3353 unsigned char ch = *s;
3354 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003356 if (ch < maplen)
3357 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003359 if (x == 0xfffe) {
3360 /* undefined mapping */
3361 outpos = p-PyUnicode_AS_UNICODE(v);
3362 startinpos = s-starts;
3363 endinpos = startinpos+1;
3364 if (unicode_decode_call_errorhandler(
3365 errors, &errorHandler,
3366 "charmap", "character maps to <undefined>",
3367 starts, size, &startinpos, &endinpos, &exc, &s,
3368 (PyObject **)&v, &outpos, &p)) {
3369 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003370 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003371 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003372 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003373 *p++ = x;
3374 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003376 }
3377 else {
3378 while (s < e) {
3379 unsigned char ch = *s;
3380 PyObject *w, *x;
3381
3382 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3383 w = PyInt_FromLong((long)ch);
3384 if (w == NULL)
3385 goto onError;
3386 x = PyObject_GetItem(mapping, w);
3387 Py_DECREF(w);
3388 if (x == NULL) {
3389 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3390 /* No mapping found means: mapping is undefined. */
3391 PyErr_Clear();
3392 x = Py_None;
3393 Py_INCREF(x);
3394 } else
3395 goto onError;
3396 }
3397
3398 /* Apply mapping */
3399 if (PyInt_Check(x)) {
3400 long value = PyInt_AS_LONG(x);
3401 if (value < 0 || value > 65535) {
3402 PyErr_SetString(PyExc_TypeError,
3403 "character mapping must be in range(65536)");
3404 Py_DECREF(x);
3405 goto onError;
3406 }
3407 *p++ = (Py_UNICODE)value;
3408 }
3409 else if (x == Py_None) {
3410 /* undefined mapping */
3411 outpos = p-PyUnicode_AS_UNICODE(v);
3412 startinpos = s-starts;
3413 endinpos = startinpos+1;
3414 if (unicode_decode_call_errorhandler(
3415 errors, &errorHandler,
3416 "charmap", "character maps to <undefined>",
3417 starts, size, &startinpos, &endinpos, &exc, &s,
3418 (PyObject **)&v, &outpos, &p)) {
3419 Py_DECREF(x);
3420 goto onError;
3421 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003422 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003423 continue;
3424 }
3425 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003426 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003427
3428 if (targetsize == 1)
3429 /* 1-1 mapping */
3430 *p++ = *PyUnicode_AS_UNICODE(x);
3431
3432 else if (targetsize > 1) {
3433 /* 1-n mapping */
3434 if (targetsize > extrachars) {
3435 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003436 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3437 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003438 (targetsize << 2);
3439 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003440 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003441 if (_PyUnicode_Resize(&v,
3442 PyUnicode_GET_SIZE(v) + needed) < 0) {
3443 Py_DECREF(x);
3444 goto onError;
3445 }
3446 p = PyUnicode_AS_UNICODE(v) + oldpos;
3447 }
3448 Py_UNICODE_COPY(p,
3449 PyUnicode_AS_UNICODE(x),
3450 targetsize);
3451 p += targetsize;
3452 extrachars -= targetsize;
3453 }
3454 /* 1-0 mapping: skip the character */
3455 }
3456 else {
3457 /* wrong return value */
3458 PyErr_SetString(PyExc_TypeError,
3459 "character mapping must return integer, None or unicode");
3460 Py_DECREF(x);
3461 goto onError;
3462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003464 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 }
3467 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003468 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 Py_XDECREF(errorHandler);
3471 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003473
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 Py_XDECREF(errorHandler);
3476 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003477 Py_XDECREF(v);
3478 return NULL;
3479}
3480
Martin v. Löwis3f767792006-06-04 19:36:28 +00003481/* Charmap encoding: the lookup table */
3482
3483struct encoding_map{
3484 PyObject_HEAD
3485 unsigned char level1[32];
3486 int count2, count3;
3487 unsigned char level23[1];
3488};
3489
3490static PyObject*
3491encoding_map_size(PyObject *obj, PyObject* args)
3492{
3493 struct encoding_map *map = (struct encoding_map*)obj;
3494 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3495 128*map->count3);
3496}
3497
3498static PyMethodDef encoding_map_methods[] = {
3499 {"size", encoding_map_size, METH_NOARGS,
3500 PyDoc_STR("Return the size (in bytes) of this object") },
3501 { 0 }
3502};
3503
3504static void
3505encoding_map_dealloc(PyObject* o)
3506{
3507 PyObject_FREE(o);
3508}
3509
3510static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00003511 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003512 "EncodingMap", /*tp_name*/
3513 sizeof(struct encoding_map), /*tp_basicsize*/
3514 0, /*tp_itemsize*/
3515 /* methods */
3516 encoding_map_dealloc, /*tp_dealloc*/
3517 0, /*tp_print*/
3518 0, /*tp_getattr*/
3519 0, /*tp_setattr*/
3520 0, /*tp_compare*/
3521 0, /*tp_repr*/
3522 0, /*tp_as_number*/
3523 0, /*tp_as_sequence*/
3524 0, /*tp_as_mapping*/
3525 0, /*tp_hash*/
3526 0, /*tp_call*/
3527 0, /*tp_str*/
3528 0, /*tp_getattro*/
3529 0, /*tp_setattro*/
3530 0, /*tp_as_buffer*/
3531 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3532 0, /*tp_doc*/
3533 0, /*tp_traverse*/
3534 0, /*tp_clear*/
3535 0, /*tp_richcompare*/
3536 0, /*tp_weaklistoffset*/
3537 0, /*tp_iter*/
3538 0, /*tp_iternext*/
3539 encoding_map_methods, /*tp_methods*/
3540 0, /*tp_members*/
3541 0, /*tp_getset*/
3542 0, /*tp_base*/
3543 0, /*tp_dict*/
3544 0, /*tp_descr_get*/
3545 0, /*tp_descr_set*/
3546 0, /*tp_dictoffset*/
3547 0, /*tp_init*/
3548 0, /*tp_alloc*/
3549 0, /*tp_new*/
3550 0, /*tp_free*/
3551 0, /*tp_is_gc*/
3552};
3553
3554PyObject*
3555PyUnicode_BuildEncodingMap(PyObject* string)
3556{
3557 Py_UNICODE *decode;
3558 PyObject *result;
3559 struct encoding_map *mresult;
3560 int i;
3561 int need_dict = 0;
3562 unsigned char level1[32];
3563 unsigned char level2[512];
3564 unsigned char *mlevel1, *mlevel2, *mlevel3;
3565 int count2 = 0, count3 = 0;
3566
3567 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3568 PyErr_BadArgument();
3569 return NULL;
3570 }
3571 decode = PyUnicode_AS_UNICODE(string);
3572 memset(level1, 0xFF, sizeof level1);
3573 memset(level2, 0xFF, sizeof level2);
3574
3575 /* If there isn't a one-to-one mapping of NULL to \0,
3576 or if there are non-BMP characters, we need to use
3577 a mapping dictionary. */
3578 if (decode[0] != 0)
3579 need_dict = 1;
3580 for (i = 1; i < 256; i++) {
3581 int l1, l2;
3582 if (decode[i] == 0
3583 #ifdef Py_UNICODE_WIDE
3584 || decode[i] > 0xFFFF
3585 #endif
3586 ) {
3587 need_dict = 1;
3588 break;
3589 }
3590 if (decode[i] == 0xFFFE)
3591 /* unmapped character */
3592 continue;
3593 l1 = decode[i] >> 11;
3594 l2 = decode[i] >> 7;
3595 if (level1[l1] == 0xFF)
3596 level1[l1] = count2++;
3597 if (level2[l2] == 0xFF)
3598 level2[l2] = count3++;
3599 }
3600
3601 if (count2 >= 0xFF || count3 >= 0xFF)
3602 need_dict = 1;
3603
3604 if (need_dict) {
3605 PyObject *result = PyDict_New();
3606 PyObject *key, *value;
3607 if (!result)
3608 return NULL;
3609 for (i = 0; i < 256; i++) {
3610 key = value = NULL;
3611 key = PyInt_FromLong(decode[i]);
3612 value = PyInt_FromLong(i);
3613 if (!key || !value)
3614 goto failed1;
3615 if (PyDict_SetItem(result, key, value) == -1)
3616 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003617 Py_DECREF(key);
3618 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003619 }
3620 return result;
3621 failed1:
3622 Py_XDECREF(key);
3623 Py_XDECREF(value);
3624 Py_DECREF(result);
3625 return NULL;
3626 }
3627
3628 /* Create a three-level trie */
3629 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3630 16*count2 + 128*count3 - 1);
3631 if (!result)
3632 return PyErr_NoMemory();
3633 PyObject_Init(result, &EncodingMapType);
3634 mresult = (struct encoding_map*)result;
3635 mresult->count2 = count2;
3636 mresult->count3 = count3;
3637 mlevel1 = mresult->level1;
3638 mlevel2 = mresult->level23;
3639 mlevel3 = mresult->level23 + 16*count2;
3640 memcpy(mlevel1, level1, 32);
3641 memset(mlevel2, 0xFF, 16*count2);
3642 memset(mlevel3, 0, 128*count3);
3643 count3 = 0;
3644 for (i = 1; i < 256; i++) {
3645 int o1, o2, o3, i2, i3;
3646 if (decode[i] == 0xFFFE)
3647 /* unmapped character */
3648 continue;
3649 o1 = decode[i]>>11;
3650 o2 = (decode[i]>>7) & 0xF;
3651 i2 = 16*mlevel1[o1] + o2;
3652 if (mlevel2[i2] == 0xFF)
3653 mlevel2[i2] = count3++;
3654 o3 = decode[i] & 0x7F;
3655 i3 = 128*mlevel2[i2] + o3;
3656 mlevel3[i3] = i;
3657 }
3658 return result;
3659}
3660
3661static int
3662encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3663{
3664 struct encoding_map *map = (struct encoding_map*)mapping;
3665 int l1 = c>>11;
3666 int l2 = (c>>7) & 0xF;
3667 int l3 = c & 0x7F;
3668 int i;
3669
3670#ifdef Py_UNICODE_WIDE
3671 if (c > 0xFFFF) {
3672 return -1;
3673 }
3674#endif
3675 if (c == 0)
3676 return 0;
3677 /* level 1*/
3678 i = map->level1[l1];
3679 if (i == 0xFF) {
3680 return -1;
3681 }
3682 /* level 2*/
3683 i = map->level23[16*i+l2];
3684 if (i == 0xFF) {
3685 return -1;
3686 }
3687 /* level 3 */
3688 i = map->level23[16*map->count2 + 128*i + l3];
3689 if (i == 0) {
3690 return -1;
3691 }
3692 return i;
3693}
3694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695/* Lookup the character ch in the mapping. If the character
3696 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003697 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700 PyObject *w = PyInt_FromLong((long)c);
3701 PyObject *x;
3702
3703 if (w == NULL)
3704 return NULL;
3705 x = PyObject_GetItem(mapping, w);
3706 Py_DECREF(w);
3707 if (x == NULL) {
3708 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3709 /* No mapping found means: mapping is undefined. */
3710 PyErr_Clear();
3711 x = Py_None;
3712 Py_INCREF(x);
3713 return x;
3714 } else
3715 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003717 else if (x == Py_None)
3718 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 else if (PyInt_Check(x)) {
3720 long value = PyInt_AS_LONG(x);
3721 if (value < 0 || value > 255) {
3722 PyErr_SetString(PyExc_TypeError,
3723 "character mapping must be in range(256)");
3724 Py_DECREF(x);
3725 return NULL;
3726 }
3727 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 else if (PyString_Check(x))
3730 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 /* wrong return value */
3733 PyErr_SetString(PyExc_TypeError,
3734 "character mapping must return integer, None or str");
3735 Py_DECREF(x);
3736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 }
3738}
3739
Martin v. Löwis3f767792006-06-04 19:36:28 +00003740static int
3741charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3742{
3743 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3744 /* exponentially overallocate to minimize reallocations */
3745 if (requiredsize < 2*outsize)
3746 requiredsize = 2*outsize;
3747 if (_PyString_Resize(outobj, requiredsize)) {
3748 return 0;
3749 }
3750 return 1;
3751}
3752
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756/* lookup the character, put the result in the output string and adjust
3757 various state variables. Reallocate the output string if not enough
3758 space is available. Return a new reference to the object that
3759 was put in the output buffer, or Py_None, if the mapping was undefined
3760 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003761 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003763charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003764 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003766 PyObject *rep;
3767 char *outstart;
3768 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769
Martin v. Löwis68192102007-07-21 06:55:02 +00003770 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003771 int res = encoding_map_lookup(c, mapping);
3772 Py_ssize_t requiredsize = *outpos+1;
3773 if (res == -1)
3774 return enc_FAILED;
3775 if (outsize<requiredsize)
3776 if (!charmapencode_resize(outobj, outpos, requiredsize))
3777 return enc_EXCEPTION;
3778 outstart = PyString_AS_STRING(*outobj);
3779 outstart[(*outpos)++] = (char)res;
3780 return enc_SUCCESS;
3781 }
3782
3783 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003785 return enc_EXCEPTION;
3786 else if (rep==Py_None) {
3787 Py_DECREF(rep);
3788 return enc_FAILED;
3789 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003790 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003791 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003792 if (outsize<requiredsize)
3793 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003795 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003797 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3799 }
3800 else {
3801 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003802 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3803 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003804 if (outsize<requiredsize)
3805 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003807 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003809 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 memcpy(outstart + *outpos, repchars, repsize);
3811 *outpos += repsize;
3812 }
3813 }
Georg Brandl9f167602006-06-04 21:46:16 +00003814 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003815 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816}
3817
3818/* handle an error in PyUnicode_EncodeCharmap
3819 Return 0 on success, -1 on error */
3820static
3821int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003822 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003824 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003825 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003826{
3827 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003828 Py_ssize_t repsize;
3829 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 Py_UNICODE *uni2;
3831 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003832 Py_ssize_t collstartpos = *inpos;
3833 Py_ssize_t collendpos = *inpos+1;
3834 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 char *encoding = "charmap";
3836 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003837 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839 /* find all unencodable characters */
3840 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003841 PyObject *rep;
Martin v. Löwis68192102007-07-21 06:55:02 +00003842 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003843 int res = encoding_map_lookup(p[collendpos], mapping);
3844 if (res != -1)
3845 break;
3846 ++collendpos;
3847 continue;
3848 }
3849
3850 rep = charmapencode_lookup(p[collendpos], mapping);
3851 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003853 else if (rep!=Py_None) {
3854 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003855 break;
3856 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003857 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003858 ++collendpos;
3859 }
3860 /* cache callback name lookup
3861 * (if not done yet, i.e. it's the first error) */
3862 if (*known_errorHandler==-1) {
3863 if ((errors==NULL) || (!strcmp(errors, "strict")))
3864 *known_errorHandler = 1;
3865 else if (!strcmp(errors, "replace"))
3866 *known_errorHandler = 2;
3867 else if (!strcmp(errors, "ignore"))
3868 *known_errorHandler = 3;
3869 else if (!strcmp(errors, "xmlcharrefreplace"))
3870 *known_errorHandler = 4;
3871 else
3872 *known_errorHandler = 0;
3873 }
3874 switch (*known_errorHandler) {
3875 case 1: /* strict */
3876 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3877 return -1;
3878 case 2: /* replace */
3879 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3880 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003881 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 return -1;
3883 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003884 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3886 return -1;
3887 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 }
3889 /* fall through */
3890 case 3: /* ignore */
3891 *inpos = collendpos;
3892 break;
3893 case 4: /* xmlcharrefreplace */
3894 /* generate replacement (temporarily (mis)uses p) */
3895 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3896 char buffer[2+29+1+1];
3897 char *cp;
3898 sprintf(buffer, "&#%d;", (int)p[collpos]);
3899 for (cp = buffer; *cp; ++cp) {
3900 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003901 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003902 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003903 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3905 return -1;
3906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 }
3908 }
3909 *inpos = collendpos;
3910 break;
3911 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003912 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 encoding, reason, p, size, exceptionObject,
3914 collstartpos, collendpos, &newpos);
3915 if (repunicode == NULL)
3916 return -1;
3917 /* generate replacement */
3918 repsize = PyUnicode_GET_SIZE(repunicode);
3919 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3920 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003921 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 return -1;
3923 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003924 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3927 return -1;
3928 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 }
3930 *inpos = newpos;
3931 Py_DECREF(repunicode);
3932 }
3933 return 0;
3934}
3935
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003937 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 PyObject *mapping,
3939 const char *errors)
3940{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003941 /* output object */
3942 PyObject *res = NULL;
3943 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003944 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 PyObject *errorHandler = NULL;
3948 PyObject *exc = NULL;
3949 /* the following variable is used for caching string comparisons
3950 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3951 * 3=ignore, 4=xmlcharrefreplace */
3952 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953
3954 /* Default to Latin-1 */
3955 if (mapping == NULL)
3956 return PyUnicode_EncodeLatin1(p, size, errors);
3957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 /* allocate enough for a simple encoding without
3959 replacements, if we need more, we'll resize */
3960 res = PyString_FromStringAndSize(NULL, size);
3961 if (res == NULL)
3962 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003963 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 while (inpos<size) {
3967 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003968 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3969 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003971 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 if (charmap_encoding_error(p, size, &inpos, mapping,
3973 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003974 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003975 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003976 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 else
3980 /* done with this character => adjust input position */
3981 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 /* Resize if we allocated to much */
3985 if (respos<PyString_GET_SIZE(res)) {
3986 if (_PyString_Resize(&res, respos))
3987 goto onError;
3988 }
3989 Py_XDECREF(exc);
3990 Py_XDECREF(errorHandler);
3991 return res;
3992
3993 onError:
3994 Py_XDECREF(res);
3995 Py_XDECREF(exc);
3996 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 return NULL;
3998}
3999
4000PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4001 PyObject *mapping)
4002{
4003 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4004 PyErr_BadArgument();
4005 return NULL;
4006 }
4007 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4008 PyUnicode_GET_SIZE(unicode),
4009 mapping,
4010 NULL);
4011}
4012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013/* create or adjust a UnicodeTranslateError */
4014static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004015 const Py_UNICODE *unicode, Py_ssize_t size,
4016 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004019 if (*exceptionObject == NULL) {
4020 *exceptionObject = PyUnicodeTranslateError_Create(
4021 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 }
4023 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4025 goto onError;
4026 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4027 goto onError;
4028 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4029 goto onError;
4030 return;
4031 onError:
4032 Py_DECREF(*exceptionObject);
4033 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 }
4035}
4036
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037/* raises a UnicodeTranslateError */
4038static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004039 const Py_UNICODE *unicode, Py_ssize_t size,
4040 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 const char *reason)
4042{
4043 make_translate_exception(exceptionObject,
4044 unicode, size, startpos, endpos, reason);
4045 if (*exceptionObject != NULL)
4046 PyCodec_StrictErrors(*exceptionObject);
4047}
4048
4049/* error handling callback helper:
4050 build arguments, call the callback and check the arguments,
4051 put the result into newpos and return the replacement string, which
4052 has to be freed by the caller */
4053static PyObject *unicode_translate_call_errorhandler(const char *errors,
4054 PyObject **errorHandler,
4055 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004056 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4057 Py_ssize_t startpos, Py_ssize_t endpos,
4058 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004060 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061
Martin v. Löwis412fb672006-04-13 06:34:32 +00004062 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063 PyObject *restuple;
4064 PyObject *resunicode;
4065
4066 if (*errorHandler == NULL) {
4067 *errorHandler = PyCodec_LookupError(errors);
4068 if (*errorHandler == NULL)
4069 return NULL;
4070 }
4071
4072 make_translate_exception(exceptionObject,
4073 unicode, size, startpos, endpos, reason);
4074 if (*exceptionObject == NULL)
4075 return NULL;
4076
4077 restuple = PyObject_CallFunctionObjArgs(
4078 *errorHandler, *exceptionObject, NULL);
4079 if (restuple == NULL)
4080 return NULL;
4081 if (!PyTuple_Check(restuple)) {
4082 PyErr_Format(PyExc_TypeError, &argparse[4]);
4083 Py_DECREF(restuple);
4084 return NULL;
4085 }
4086 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004087 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088 Py_DECREF(restuple);
4089 return NULL;
4090 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004091 if (i_newpos<0)
4092 *newpos = size+i_newpos;
4093 else
4094 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004095 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004096 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004097 Py_DECREF(restuple);
4098 return NULL;
4099 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100 Py_INCREF(resunicode);
4101 Py_DECREF(restuple);
4102 return resunicode;
4103}
4104
4105/* Lookup the character ch in the mapping and put the result in result,
4106 which must be decrefed by the caller.
4107 Return 0 on success, -1 on error */
4108static
4109int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4110{
4111 PyObject *w = PyInt_FromLong((long)c);
4112 PyObject *x;
4113
4114 if (w == NULL)
4115 return -1;
4116 x = PyObject_GetItem(mapping, w);
4117 Py_DECREF(w);
4118 if (x == NULL) {
4119 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4120 /* No mapping found means: use 1:1 mapping. */
4121 PyErr_Clear();
4122 *result = NULL;
4123 return 0;
4124 } else
4125 return -1;
4126 }
4127 else if (x == Py_None) {
4128 *result = x;
4129 return 0;
4130 }
4131 else if (PyInt_Check(x)) {
4132 long value = PyInt_AS_LONG(x);
4133 long max = PyUnicode_GetMax();
4134 if (value < 0 || value > max) {
4135 PyErr_Format(PyExc_TypeError,
4136 "character mapping must be in range(0x%lx)", max+1);
4137 Py_DECREF(x);
4138 return -1;
4139 }
4140 *result = x;
4141 return 0;
4142 }
4143 else if (PyUnicode_Check(x)) {
4144 *result = x;
4145 return 0;
4146 }
4147 else {
4148 /* wrong return value */
4149 PyErr_SetString(PyExc_TypeError,
4150 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004151 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 return -1;
4153 }
4154}
4155/* ensure that *outobj is at least requiredsize characters long,
4156if not reallocate and adjust various state variables.
4157Return 0 on success, -1 on error */
4158static
Walter Dörwald4894c302003-10-24 14:25:28 +00004159int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004160 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004162 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004163 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004165 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004167 if (requiredsize < 2 * oldsize)
4168 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004169 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 return -1;
4171 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 }
4173 return 0;
4174}
4175/* lookup the character, put the result in the output string and adjust
4176 various state variables. Return a new reference to the object that
4177 was put in the output buffer in *result, or Py_None, if the mapping was
4178 undefined (in which case no character was written).
4179 The called must decref result.
4180 Return 0 on success, -1 on error. */
4181static
Walter Dörwald4894c302003-10-24 14:25:28 +00004182int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004183 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004184 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185{
Walter Dörwald4894c302003-10-24 14:25:28 +00004186 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 return -1;
4188 if (*res==NULL) {
4189 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004190 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 }
4192 else if (*res==Py_None)
4193 ;
4194 else if (PyInt_Check(*res)) {
4195 /* no overflow check, because we know that the space is enough */
4196 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4197 }
4198 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004199 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 if (repsize==1) {
4201 /* no overflow check, because we know that the space is enough */
4202 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4203 }
4204 else if (repsize!=0) {
4205 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004206 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004207 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004208 repsize - 1;
4209 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 return -1;
4211 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4212 *outp += repsize;
4213 }
4214 }
4215 else
4216 return -1;
4217 return 0;
4218}
4219
4220PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004221 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 PyObject *mapping,
4223 const char *errors)
4224{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 /* output object */
4226 PyObject *res = NULL;
4227 /* pointers to the beginning and end+1 of input */
4228 const Py_UNICODE *startp = p;
4229 const Py_UNICODE *endp = p + size;
4230 /* pointer into the output */
4231 Py_UNICODE *str;
4232 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004233 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234 char *reason = "character maps to <undefined>";
4235 PyObject *errorHandler = NULL;
4236 PyObject *exc = NULL;
4237 /* the following variable is used for caching string comparisons
4238 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4239 * 3=ignore, 4=xmlcharrefreplace */
4240 int known_errorHandler = -1;
4241
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242 if (mapping == NULL) {
4243 PyErr_BadArgument();
4244 return NULL;
4245 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246
4247 /* allocate enough for a simple 1:1 translation without
4248 replacements, if we need more, we'll resize */
4249 res = PyUnicode_FromUnicode(NULL, size);
4250 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004251 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253 return res;
4254 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 while (p<endp) {
4257 /* try to encode it */
4258 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004259 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004260 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 goto onError;
4262 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004263 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 if (x!=Py_None) /* it worked => adjust input pointer */
4265 ++p;
4266 else { /* untranslatable character */
4267 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004268 Py_ssize_t repsize;
4269 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 Py_UNICODE *uni2;
4271 /* startpos for collecting untranslatable chars */
4272 const Py_UNICODE *collstart = p;
4273 const Py_UNICODE *collend = p+1;
4274 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276 /* find all untranslatable characters */
4277 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004278 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 goto onError;
4280 Py_XDECREF(x);
4281 if (x!=Py_None)
4282 break;
4283 ++collend;
4284 }
4285 /* cache callback name lookup
4286 * (if not done yet, i.e. it's the first error) */
4287 if (known_errorHandler==-1) {
4288 if ((errors==NULL) || (!strcmp(errors, "strict")))
4289 known_errorHandler = 1;
4290 else if (!strcmp(errors, "replace"))
4291 known_errorHandler = 2;
4292 else if (!strcmp(errors, "ignore"))
4293 known_errorHandler = 3;
4294 else if (!strcmp(errors, "xmlcharrefreplace"))
4295 known_errorHandler = 4;
4296 else
4297 known_errorHandler = 0;
4298 }
4299 switch (known_errorHandler) {
4300 case 1: /* strict */
4301 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4302 goto onError;
4303 case 2: /* replace */
4304 /* No need to check for space, this is a 1:1 replacement */
4305 for (coll = collstart; coll<collend; ++coll)
4306 *str++ = '?';
4307 /* fall through */
4308 case 3: /* ignore */
4309 p = collend;
4310 break;
4311 case 4: /* xmlcharrefreplace */
4312 /* generate replacement (temporarily (mis)uses p) */
4313 for (p = collstart; p < collend; ++p) {
4314 char buffer[2+29+1+1];
4315 char *cp;
4316 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004317 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004318 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4319 goto onError;
4320 for (cp = buffer; *cp; ++cp)
4321 *str++ = *cp;
4322 }
4323 p = collend;
4324 break;
4325 default:
4326 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4327 reason, startp, size, &exc,
4328 collstart-startp, collend-startp, &newpos);
4329 if (repunicode == NULL)
4330 goto onError;
4331 /* generate replacement */
4332 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004333 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4335 Py_DECREF(repunicode);
4336 goto onError;
4337 }
4338 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4339 *str++ = *uni2;
4340 p = startp + newpos;
4341 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 }
4343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 /* Resize if we allocated to much */
4346 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004347 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004348 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004349 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350 }
4351 Py_XDECREF(exc);
4352 Py_XDECREF(errorHandler);
4353 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 onError:
4356 Py_XDECREF(res);
4357 Py_XDECREF(exc);
4358 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359 return NULL;
4360}
4361
4362PyObject *PyUnicode_Translate(PyObject *str,
4363 PyObject *mapping,
4364 const char *errors)
4365{
4366 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004367
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368 str = PyUnicode_FromObject(str);
4369 if (str == NULL)
4370 goto onError;
4371 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4372 PyUnicode_GET_SIZE(str),
4373 mapping,
4374 errors);
4375 Py_DECREF(str);
4376 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004377
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 onError:
4379 Py_XDECREF(str);
4380 return NULL;
4381}
Tim Petersced69f82003-09-16 20:30:58 +00004382
Guido van Rossum9e896b32000-04-05 20:11:21 +00004383/* --- Decimal Encoder ---------------------------------------------------- */
4384
4385int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004386 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004387 char *output,
4388 const char *errors)
4389{
4390 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 PyObject *errorHandler = NULL;
4392 PyObject *exc = NULL;
4393 const char *encoding = "decimal";
4394 const char *reason = "invalid decimal Unicode string";
4395 /* the following variable is used for caching string comparisons
4396 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4397 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004398
4399 if (output == NULL) {
4400 PyErr_BadArgument();
4401 return -1;
4402 }
4403
4404 p = s;
4405 end = s + length;
4406 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004408 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004410 Py_ssize_t repsize;
4411 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 Py_UNICODE *uni2;
4413 Py_UNICODE *collstart;
4414 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004415
Guido van Rossum9e896b32000-04-05 20:11:21 +00004416 if (Py_UNICODE_ISSPACE(ch)) {
4417 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004419 continue;
4420 }
4421 decimal = Py_UNICODE_TODECIMAL(ch);
4422 if (decimal >= 0) {
4423 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004425 continue;
4426 }
Guido van Rossumba477042000-04-06 18:18:10 +00004427 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004428 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004430 continue;
4431 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 /* All other characters are considered unencodable */
4433 collstart = p;
4434 collend = p+1;
4435 while (collend < end) {
4436 if ((0 < *collend && *collend < 256) ||
4437 !Py_UNICODE_ISSPACE(*collend) ||
4438 Py_UNICODE_TODECIMAL(*collend))
4439 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 /* cache callback name lookup
4442 * (if not done yet, i.e. it's the first error) */
4443 if (known_errorHandler==-1) {
4444 if ((errors==NULL) || (!strcmp(errors, "strict")))
4445 known_errorHandler = 1;
4446 else if (!strcmp(errors, "replace"))
4447 known_errorHandler = 2;
4448 else if (!strcmp(errors, "ignore"))
4449 known_errorHandler = 3;
4450 else if (!strcmp(errors, "xmlcharrefreplace"))
4451 known_errorHandler = 4;
4452 else
4453 known_errorHandler = 0;
4454 }
4455 switch (known_errorHandler) {
4456 case 1: /* strict */
4457 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4458 goto onError;
4459 case 2: /* replace */
4460 for (p = collstart; p < collend; ++p)
4461 *output++ = '?';
4462 /* fall through */
4463 case 3: /* ignore */
4464 p = collend;
4465 break;
4466 case 4: /* xmlcharrefreplace */
4467 /* generate replacement (temporarily (mis)uses p) */
4468 for (p = collstart; p < collend; ++p)
4469 output += sprintf(output, "&#%d;", (int)*p);
4470 p = collend;
4471 break;
4472 default:
4473 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4474 encoding, reason, s, length, &exc,
4475 collstart-s, collend-s, &newpos);
4476 if (repunicode == NULL)
4477 goto onError;
4478 /* generate replacement */
4479 repsize = PyUnicode_GET_SIZE(repunicode);
4480 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4481 Py_UNICODE ch = *uni2;
4482 if (Py_UNICODE_ISSPACE(ch))
4483 *output++ = ' ';
4484 else {
4485 decimal = Py_UNICODE_TODECIMAL(ch);
4486 if (decimal >= 0)
4487 *output++ = '0' + decimal;
4488 else if (0 < ch && ch < 256)
4489 *output++ = (char)ch;
4490 else {
4491 Py_DECREF(repunicode);
4492 raise_encode_exception(&exc, encoding,
4493 s, length, collstart-s, collend-s, reason);
4494 goto onError;
4495 }
4496 }
4497 }
4498 p = s + newpos;
4499 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004500 }
4501 }
4502 /* 0-terminate the output string */
4503 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 Py_XDECREF(exc);
4505 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004506 return 0;
4507
4508 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 Py_XDECREF(exc);
4510 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004511 return -1;
4512}
4513
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514/* --- Helpers ------------------------------------------------------------ */
4515
Fredrik Lundha50d2012006-05-26 17:04:58 +00004516#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004517
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004518#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004519#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004520#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004521
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004522Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004523STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4524{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004525 if (str[0] != other[0])
4526 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004527 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4528}
4529
Fredrik Lundhb9479482006-05-26 17:22:38 +00004530#define STRINGLIB_EMPTY unicode_empty
4531
Fredrik Lundha50d2012006-05-26 17:04:58 +00004532#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004533
4534#include "stringlib/count.h"
4535#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004536#include "stringlib/partition.h"
4537
/* helper macro to fixup start/end slice values.
   Operates on the variables `start` and `end` of the enclosing scope, using
   (obj)->length as the sequence length: a negative start/end is taken
   relative to the length and then clamped to 0, and an end beyond the
   length is clamped to the length.
   Wrapped in do { } while (0) so that the expansion is a single statement
   and stays correct under an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4550
Martin v. Löwis18e16552006-02-15 17:27:45 +00004551Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004552 PyObject *substr,
4553 Py_ssize_t start,
4554 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004557 PyUnicodeObject* str_obj;
4558 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004559
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004560 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4561 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004563 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4564 if (!sub_obj) {
4565 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566 return -1;
4567 }
Tim Petersced69f82003-09-16 20:30:58 +00004568
Fredrik Lundhc8162812006-05-26 19:33:03 +00004569 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004570
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004571 result = stringlib_count(
4572 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4573 );
4574
4575 Py_DECREF(sub_obj);
4576 Py_DECREF(str_obj);
4577
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 return result;
4579}
4580
Martin v. Löwis18e16552006-02-15 17:27:45 +00004581Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004582 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004583 Py_ssize_t start,
4584 Py_ssize_t end,
4585 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004587 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004588
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004589 str = PyUnicode_FromObject(str);
4590 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004591 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004592 sub = PyUnicode_FromObject(sub);
4593 if (!sub) {
4594 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004595 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 }
Tim Petersced69f82003-09-16 20:30:58 +00004597
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004598 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004599 result = stringlib_find_slice(
4600 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4601 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4602 start, end
4603 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004604 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004605 result = stringlib_rfind_slice(
4606 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4607 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4608 start, end
4609 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004610
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004611 Py_DECREF(str);
4612 Py_DECREF(sub);
4613
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614 return result;
4615}
4616
Tim Petersced69f82003-09-16 20:30:58 +00004617static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618int tailmatch(PyUnicodeObject *self,
4619 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004620 Py_ssize_t start,
4621 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 int direction)
4623{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624 if (substring->length == 0)
4625 return 1;
4626
Fredrik Lundhc8162812006-05-26 19:33:03 +00004627 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628
4629 end -= substring->length;
4630 if (end < start)
4631 return 0;
4632
4633 if (direction > 0) {
4634 if (Py_UNICODE_MATCH(self, end, substring))
4635 return 1;
4636 } else {
4637 if (Py_UNICODE_MATCH(self, start, substring))
4638 return 1;
4639 }
4640
4641 return 0;
4642}
4643
Martin v. Löwis18e16552006-02-15 17:27:45 +00004644Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004646 Py_ssize_t start,
4647 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648 int direction)
4649{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004650 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004651
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 str = PyUnicode_FromObject(str);
4653 if (str == NULL)
4654 return -1;
4655 substr = PyUnicode_FromObject(substr);
4656 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004657 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658 return -1;
4659 }
Tim Petersced69f82003-09-16 20:30:58 +00004660
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661 result = tailmatch((PyUnicodeObject *)str,
4662 (PyUnicodeObject *)substr,
4663 start, end, direction);
4664 Py_DECREF(str);
4665 Py_DECREF(substr);
4666 return result;
4667}
4668
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669/* Apply fixfct filter to the Unicode object self and return a
4670 reference to the modified object */
4671
Tim Petersced69f82003-09-16 20:30:58 +00004672static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673PyObject *fixup(PyUnicodeObject *self,
4674 int (*fixfct)(PyUnicodeObject *s))
4675{
4676
4677 PyUnicodeObject *u;
4678
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004679 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 if (u == NULL)
4681 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004682
4683 Py_UNICODE_COPY(u->str, self->str, self->length);
4684
Tim Peters7a29bd52001-09-12 03:03:31 +00004685 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 /* fixfct should return TRUE if it modified the buffer. If
4687 FALSE, return a reference to the original buffer instead
4688 (to save space, not time) */
4689 Py_INCREF(self);
4690 Py_DECREF(u);
4691 return (PyObject*) self;
4692 }
4693 return (PyObject*) u;
4694}
4695
Tim Petersced69f82003-09-16 20:30:58 +00004696static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697int fixupper(PyUnicodeObject *self)
4698{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004699 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700 Py_UNICODE *s = self->str;
4701 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004702
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 while (len-- > 0) {
4704 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004705
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706 ch = Py_UNICODE_TOUPPER(*s);
4707 if (ch != *s) {
4708 status = 1;
4709 *s = ch;
4710 }
4711 s++;
4712 }
4713
4714 return status;
4715}
4716
Tim Petersced69f82003-09-16 20:30:58 +00004717static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718int fixlower(PyUnicodeObject *self)
4719{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004720 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721 Py_UNICODE *s = self->str;
4722 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004723
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 while (len-- > 0) {
4725 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004726
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727 ch = Py_UNICODE_TOLOWER(*s);
4728 if (ch != *s) {
4729 status = 1;
4730 *s = ch;
4731 }
4732 s++;
4733 }
4734
4735 return status;
4736}
4737
Tim Petersced69f82003-09-16 20:30:58 +00004738static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739int fixswapcase(PyUnicodeObject *self)
4740{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004741 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 Py_UNICODE *s = self->str;
4743 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004744
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 while (len-- > 0) {
4746 if (Py_UNICODE_ISUPPER(*s)) {
4747 *s = Py_UNICODE_TOLOWER(*s);
4748 status = 1;
4749 } else if (Py_UNICODE_ISLOWER(*s)) {
4750 *s = Py_UNICODE_TOUPPER(*s);
4751 status = 1;
4752 }
4753 s++;
4754 }
4755
4756 return status;
4757}
4758
Tim Petersced69f82003-09-16 20:30:58 +00004759static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760int fixcapitalize(PyUnicodeObject *self)
4761{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004762 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004763 Py_UNICODE *s = self->str;
4764 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004765
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004766 if (len == 0)
4767 return 0;
4768 if (Py_UNICODE_ISLOWER(*s)) {
4769 *s = Py_UNICODE_TOUPPER(*s);
4770 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004772 s++;
4773 while (--len > 0) {
4774 if (Py_UNICODE_ISUPPER(*s)) {
4775 *s = Py_UNICODE_TOLOWER(*s);
4776 status = 1;
4777 }
4778 s++;
4779 }
4780 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781}
4782
4783static
4784int fixtitle(PyUnicodeObject *self)
4785{
4786 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4787 register Py_UNICODE *e;
4788 int previous_is_cased;
4789
4790 /* Shortcut for single character strings */
4791 if (PyUnicode_GET_SIZE(self) == 1) {
4792 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4793 if (*p != ch) {
4794 *p = ch;
4795 return 1;
4796 }
4797 else
4798 return 0;
4799 }
Tim Petersced69f82003-09-16 20:30:58 +00004800
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 e = p + PyUnicode_GET_SIZE(self);
4802 previous_is_cased = 0;
4803 for (; p < e; p++) {
4804 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004805
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 if (previous_is_cased)
4807 *p = Py_UNICODE_TOLOWER(ch);
4808 else
4809 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004810
4811 if (Py_UNICODE_ISLOWER(ch) ||
4812 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 Py_UNICODE_ISTITLE(ch))
4814 previous_is_cased = 1;
4815 else
4816 previous_is_cased = 0;
4817 }
4818 return 1;
4819}
4820
Tim Peters8ce9f162004-08-27 01:49:32 +00004821PyObject *
4822PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823{
Tim Peters8ce9f162004-08-27 01:49:32 +00004824 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004825 const Py_UNICODE blank = ' ';
4826 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004827 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004828 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004829 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4830 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004831 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4832 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004833 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004834 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004835 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836
Tim Peters05eba1f2004-08-27 21:32:02 +00004837 fseq = PySequence_Fast(seq, "");
4838 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004839 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004840 }
4841
Tim Peters91879ab2004-08-27 22:35:44 +00004842 /* Grrrr. A codec may be invoked to convert str objects to
4843 * Unicode, and so it's possible to call back into Python code
4844 * during PyUnicode_FromObject(), and so it's possible for a sick
4845 * codec to change the size of fseq (if seq is a list). Therefore
4846 * we have to keep refetching the size -- can't assume seqlen
4847 * is invariant.
4848 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004849 seqlen = PySequence_Fast_GET_SIZE(fseq);
4850 /* If empty sequence, return u"". */
4851 if (seqlen == 0) {
4852 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4853 goto Done;
4854 }
4855 /* If singleton sequence with an exact Unicode, return that. */
4856 if (seqlen == 1) {
4857 item = PySequence_Fast_GET_ITEM(fseq, 0);
4858 if (PyUnicode_CheckExact(item)) {
4859 Py_INCREF(item);
4860 res = (PyUnicodeObject *)item;
4861 goto Done;
4862 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004863 }
4864
Tim Peters05eba1f2004-08-27 21:32:02 +00004865 /* At least two items to join, or one that isn't exact Unicode. */
4866 if (seqlen > 1) {
4867 /* Set up sep and seplen -- they're needed. */
4868 if (separator == NULL) {
4869 sep = &blank;
4870 seplen = 1;
4871 }
4872 else {
4873 internal_separator = PyUnicode_FromObject(separator);
4874 if (internal_separator == NULL)
4875 goto onError;
4876 sep = PyUnicode_AS_UNICODE(internal_separator);
4877 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004878 /* In case PyUnicode_FromObject() mutated seq. */
4879 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004880 }
4881 }
4882
4883 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004884 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004885 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004886 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004887 res_p = PyUnicode_AS_UNICODE(res);
4888 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004889
Tim Peters05eba1f2004-08-27 21:32:02 +00004890 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004891 Py_ssize_t itemlen;
4892 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004893
4894 item = PySequence_Fast_GET_ITEM(fseq, i);
4895 /* Convert item to Unicode. */
4896 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4897 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004898 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004899 " %.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +00004900 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004901 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004902 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004903 item = PyUnicode_FromObject(item);
4904 if (item == NULL)
4905 goto onError;
4906 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004907
Tim Peters91879ab2004-08-27 22:35:44 +00004908 /* In case PyUnicode_FromObject() mutated seq. */
4909 seqlen = PySequence_Fast_GET_SIZE(fseq);
4910
Tim Peters8ce9f162004-08-27 01:49:32 +00004911 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004913 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004914 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004915 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004916 if (i < seqlen - 1) {
4917 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004918 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004919 goto Overflow;
4920 }
4921 if (new_res_used > res_alloc) {
4922 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004923 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004924 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004925 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004926 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004927 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004928 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004929 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004931 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004932 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004934
4935 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004936 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004937 res_p += itemlen;
4938 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004939 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004940 res_p += seplen;
4941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004943 res_used = new_res_used;
4944 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004945
Tim Peters05eba1f2004-08-27 21:32:02 +00004946 /* Shrink res to match the used area; this probably can't fail,
4947 * but it's cheap to check.
4948 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004949 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004950 goto onError;
4951
4952 Done:
4953 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004954 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955 return (PyObject *)res;
4956
Tim Peters8ce9f162004-08-27 01:49:32 +00004957 Overflow:
4958 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004959 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004960 Py_DECREF(item);
4961 /* fall through */
4962
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004964 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004965 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004966 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 return NULL;
4968}
4969
Tim Petersced69f82003-09-16 20:30:58 +00004970static
4971PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004972 Py_ssize_t left,
4973 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 Py_UNICODE fill)
4975{
4976 PyUnicodeObject *u;
4977
4978 if (left < 0)
4979 left = 0;
4980 if (right < 0)
4981 right = 0;
4982
Tim Peters7a29bd52001-09-12 03:03:31 +00004983 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 Py_INCREF(self);
4985 return self;
4986 }
4987
4988 u = _PyUnicode_New(left + self->length + right);
4989 if (u) {
4990 if (left)
4991 Py_UNICODE_FILL(u->str, fill, left);
4992 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4993 if (right)
4994 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4995 }
4996
4997 return u;
4998}
4999
/* Append data[left:right] as a new Unicode object to `list`.

   Relies on the enclosing function providing:
     - a local `PyObject *str` (scratch variable),
     - a local `PyObject *list` (the result list),
     - an `onError` label, jumped to on any failure.

   Wrapped in do { } while (0) so that the expansion is a single
   statement and cannot capture a following `else`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5010
5011static
5012PyObject *split_whitespace(PyUnicodeObject *self,
5013 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005014 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005016 register Py_ssize_t i;
5017 register Py_ssize_t j;
5018 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019 PyObject *str;
5020
5021 for (i = j = 0; i < len; ) {
5022 /* find a token */
5023 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5024 i++;
5025 j = i;
5026 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5027 i++;
5028 if (j < i) {
5029 if (maxcount-- <= 0)
5030 break;
5031 SPLIT_APPEND(self->str, j, i);
5032 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5033 i++;
5034 j = i;
5035 }
5036 }
5037 if (j < len) {
5038 SPLIT_APPEND(self->str, j, len);
5039 }
5040 return list;
5041
5042 onError:
5043 Py_DECREF(list);
5044 return NULL;
5045}
5046
5047PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005048 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005050 register Py_ssize_t i;
5051 register Py_ssize_t j;
5052 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053 PyObject *list;
5054 PyObject *str;
5055 Py_UNICODE *data;
5056
5057 string = PyUnicode_FromObject(string);
5058 if (string == NULL)
5059 return NULL;
5060 data = PyUnicode_AS_UNICODE(string);
5061 len = PyUnicode_GET_SIZE(string);
5062
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 list = PyList_New(0);
5064 if (!list)
5065 goto onError;
5066
5067 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005068 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005069
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005071 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073
5074 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005075 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 if (i < len) {
5077 if (data[i] == '\r' && i + 1 < len &&
5078 data[i+1] == '\n')
5079 i += 2;
5080 else
5081 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005082 if (keepends)
5083 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 }
Guido van Rossum86662912000-04-11 15:38:46 +00005085 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 j = i;
5087 }
5088 if (j < len) {
5089 SPLIT_APPEND(data, j, len);
5090 }
5091
5092 Py_DECREF(string);
5093 return list;
5094
5095 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005096 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 Py_DECREF(string);
5098 return NULL;
5099}
5100
Tim Petersced69f82003-09-16 20:30:58 +00005101static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102PyObject *split_char(PyUnicodeObject *self,
5103 PyObject *list,
5104 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005105 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005107 register Py_ssize_t i;
5108 register Py_ssize_t j;
5109 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 PyObject *str;
5111
5112 for (i = j = 0; i < len; ) {
5113 if (self->str[i] == ch) {
5114 if (maxcount-- <= 0)
5115 break;
5116 SPLIT_APPEND(self->str, j, i);
5117 i = j = i + 1;
5118 } else
5119 i++;
5120 }
5121 if (j <= len) {
5122 SPLIT_APPEND(self->str, j, len);
5123 }
5124 return list;
5125
5126 onError:
5127 Py_DECREF(list);
5128 return NULL;
5129}
5130
Tim Petersced69f82003-09-16 20:30:58 +00005131static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132PyObject *split_substring(PyUnicodeObject *self,
5133 PyObject *list,
5134 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005135 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005137 register Py_ssize_t i;
5138 register Py_ssize_t j;
5139 Py_ssize_t len = self->length;
5140 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 PyObject *str;
5142
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005143 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144 if (Py_UNICODE_MATCH(self, i, substring)) {
5145 if (maxcount-- <= 0)
5146 break;
5147 SPLIT_APPEND(self->str, j, i);
5148 i = j = i + sublen;
5149 } else
5150 i++;
5151 }
5152 if (j <= len) {
5153 SPLIT_APPEND(self->str, j, len);
5154 }
5155 return list;
5156
5157 onError:
5158 Py_DECREF(list);
5159 return NULL;
5160}
5161
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005162static
5163PyObject *rsplit_whitespace(PyUnicodeObject *self,
5164 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005165 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005166{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005167 register Py_ssize_t i;
5168 register Py_ssize_t j;
5169 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005170 PyObject *str;
5171
5172 for (i = j = len - 1; i >= 0; ) {
5173 /* find a token */
5174 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5175 i--;
5176 j = i;
5177 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5178 i--;
5179 if (j > i) {
5180 if (maxcount-- <= 0)
5181 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005182 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005183 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5184 i--;
5185 j = i;
5186 }
5187 }
5188 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005189 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005190 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005191 if (PyList_Reverse(list) < 0)
5192 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005193 return list;
5194
5195 onError:
5196 Py_DECREF(list);
5197 return NULL;
5198}
5199
5200static
5201PyObject *rsplit_char(PyUnicodeObject *self,
5202 PyObject *list,
5203 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 register Py_ssize_t i;
5207 register Py_ssize_t j;
5208 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005209 PyObject *str;
5210
5211 for (i = j = len - 1; i >= 0; ) {
5212 if (self->str[i] == ch) {
5213 if (maxcount-- <= 0)
5214 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005215 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005216 j = i = i - 1;
5217 } else
5218 i--;
5219 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005220 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005221 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005222 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005223 if (PyList_Reverse(list) < 0)
5224 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005225 return list;
5226
5227 onError:
5228 Py_DECREF(list);
5229 return NULL;
5230}
5231
5232static
5233PyObject *rsplit_substring(PyUnicodeObject *self,
5234 PyObject *list,
5235 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005236 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005237{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 register Py_ssize_t i;
5239 register Py_ssize_t j;
5240 Py_ssize_t len = self->length;
5241 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005242 PyObject *str;
5243
5244 for (i = len - sublen, j = len; i >= 0; ) {
5245 if (Py_UNICODE_MATCH(self, i, substring)) {
5246 if (maxcount-- <= 0)
5247 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005248 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005249 j = i;
5250 i -= sublen;
5251 } else
5252 i--;
5253 }
5254 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005255 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005256 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005257 if (PyList_Reverse(list) < 0)
5258 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005259 return list;
5260
5261 onError:
5262 Py_DECREF(list);
5263 return NULL;
5264}
5265
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266#undef SPLIT_APPEND
5267
5268static
5269PyObject *split(PyUnicodeObject *self,
5270 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005271 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272{
5273 PyObject *list;
5274
5275 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005276 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277
5278 list = PyList_New(0);
5279 if (!list)
5280 return NULL;
5281
5282 if (substring == NULL)
5283 return split_whitespace(self,list,maxcount);
5284
5285 else if (substring->length == 1)
5286 return split_char(self,list,substring->str[0],maxcount);
5287
5288 else if (substring->length == 0) {
5289 Py_DECREF(list);
5290 PyErr_SetString(PyExc_ValueError, "empty separator");
5291 return NULL;
5292 }
5293 else
5294 return split_substring(self,list,substring,maxcount);
5295}
5296
Tim Petersced69f82003-09-16 20:30:58 +00005297static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005298PyObject *rsplit(PyUnicodeObject *self,
5299 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005300 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005301{
5302 PyObject *list;
5303
5304 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005305 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005306
5307 list = PyList_New(0);
5308 if (!list)
5309 return NULL;
5310
5311 if (substring == NULL)
5312 return rsplit_whitespace(self,list,maxcount);
5313
5314 else if (substring->length == 1)
5315 return rsplit_char(self,list,substring->str[0],maxcount);
5316
5317 else if (substring->length == 0) {
5318 Py_DECREF(list);
5319 PyErr_SetString(PyExc_ValueError, "empty separator");
5320 return NULL;
5321 }
5322 else
5323 return rsplit_substring(self,list,substring,maxcount);
5324}
5325
5326static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327PyObject *replace(PyUnicodeObject *self,
5328 PyUnicodeObject *str1,
5329 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005330 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331{
5332 PyUnicodeObject *u;
5333
5334 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005335 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336
Fredrik Lundh347ee272006-05-24 16:35:18 +00005337 if (str1->length == str2->length) {
5338 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005339 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005340 if (str1->length == 1) {
5341 /* replace characters */
5342 Py_UNICODE u1, u2;
5343 if (!findchar(self->str, self->length, str1->str[0]))
5344 goto nothing;
5345 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5346 if (!u)
5347 return NULL;
5348 Py_UNICODE_COPY(u->str, self->str, self->length);
5349 u1 = str1->str[0];
5350 u2 = str2->str[0];
5351 for (i = 0; i < u->length; i++)
5352 if (u->str[i] == u1) {
5353 if (--maxcount < 0)
5354 break;
5355 u->str[i] = u2;
5356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005358 i = fastsearch(
5359 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005361 if (i < 0)
5362 goto nothing;
5363 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5364 if (!u)
5365 return NULL;
5366 Py_UNICODE_COPY(u->str, self->str, self->length);
5367 while (i <= self->length - str1->length)
5368 if (Py_UNICODE_MATCH(self, i, str1)) {
5369 if (--maxcount < 0)
5370 break;
5371 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5372 i += str1->length;
5373 } else
5374 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005377
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005378 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005379 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 Py_UNICODE *p;
5381
5382 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005383 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 if (n > maxcount)
5385 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005386 if (n == 0)
5387 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005388 /* new_size = self->length + n * (str2->length - str1->length)); */
5389 delta = (str2->length - str1->length);
5390 if (delta == 0) {
5391 new_size = self->length;
5392 } else {
5393 product = n * (str2->length - str1->length);
5394 if ((product / (str2->length - str1->length)) != n) {
5395 PyErr_SetString(PyExc_OverflowError,
5396 "replace string is too long");
5397 return NULL;
5398 }
5399 new_size = self->length + product;
5400 if (new_size < 0) {
5401 PyErr_SetString(PyExc_OverflowError,
5402 "replace string is too long");
5403 return NULL;
5404 }
5405 }
5406 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005407 if (!u)
5408 return NULL;
5409 i = 0;
5410 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005411 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005412 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005413 while (n-- > 0) {
5414 /* look for next match */
5415 j = i;
5416 while (j <= e) {
5417 if (Py_UNICODE_MATCH(self, j, str1))
5418 break;
5419 j++;
5420 }
5421 if (j > i) {
5422 if (j > e)
5423 break;
5424 /* copy unchanged part [i:j] */
5425 Py_UNICODE_COPY(p, self->str+i, j-i);
5426 p += j - i;
5427 }
5428 /* copy substitution string */
5429 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005430 Py_UNICODE_COPY(p, str2->str, str2->length);
5431 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005432 }
5433 i = j + str1->length;
5434 }
5435 if (i < self->length)
5436 /* copy tail [i:] */
5437 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005438 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005439 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005440 while (n > 0) {
5441 Py_UNICODE_COPY(p, str2->str, str2->length);
5442 p += str2->length;
5443 if (--n <= 0)
5444 break;
5445 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005447 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 }
5449 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005451
5452nothing:
5453 /* nothing to replace; return original string (when possible) */
5454 if (PyUnicode_CheckExact(self)) {
5455 Py_INCREF(self);
5456 return (PyObject *) self;
5457 }
5458 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459}
5460
5461/* --- Unicode Object Methods --------------------------------------------- */
5462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005463PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464"S.title() -> unicode\n\
5465\n\
5466Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005467characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468
5469static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005470unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 return fixup(self, fixtitle);
5473}
5474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005475PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476"S.capitalize() -> unicode\n\
5477\n\
5478Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005479have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480
5481static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005482unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 return fixup(self, fixcapitalize);
5485}
5486
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word and join
   the words again with single blanks.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
5524
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005525/* Argument converter. Coerces to a single unicode character */
5526
5527static int
5528convert_uc(PyObject *obj, void *addr)
5529{
5530 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5531 PyObject *uniobj;
5532 Py_UNICODE *unistr;
5533
5534 uniobj = PyUnicode_FromObject(obj);
5535 if (uniobj == NULL) {
5536 PyErr_SetString(PyExc_TypeError,
5537 "The fill character cannot be converted to Unicode");
5538 return 0;
5539 }
5540 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5541 PyErr_SetString(PyExc_TypeError,
5542 "The fill character must be exactly one character long");
5543 Py_DECREF(uniobj);
5544 return 0;
5545 }
5546 unistr = PyUnicode_AS_UNICODE(uniobj);
5547 *fillcharloc = unistr[0];
5548 Py_DECREF(uniobj);
5549 return 1;
5550}
5551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005552PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005553"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005555Return S centered in a Unicode string of length width. Padding is\n\
5556done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
5558static PyObject *
5559unicode_center(PyUnicodeObject *self, PyObject *args)
5560{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005561 Py_ssize_t marg, left;
5562 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005563 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
Thomas Woutersde017742006-02-16 19:34:37 +00005565 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 return NULL;
5567
Tim Peters7a29bd52001-09-12 03:03:31 +00005568 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 Py_INCREF(self);
5570 return (PyObject*) self;
5571 }
5572
5573 marg = width - self->length;
5574 left = marg / 2 + (marg & width & 1);
5575
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005576 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577}
5578
Marc-André Lemburge5034372000-08-08 08:04:29 +00005579#if 0
5580
5581/* This code should go into some future Unicode collation support
5582 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005583 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005584
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005585/* speedy UTF-16 code point order comparison */
5586/* gleaned from: */
5587/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5588
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005589static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005590{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005591 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005592 0, 0, 0, 0, 0, 0, 0, 0,
5593 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005594 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005595};
5596
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597static int
5598unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5599{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005600 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005601
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 Py_UNICODE *s1 = str1->str;
5603 Py_UNICODE *s2 = str2->str;
5604
5605 len1 = str1->length;
5606 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005607
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005609 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005610
5611 c1 = *s1++;
5612 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005613
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005614 if (c1 > (1<<11) * 26)
5615 c1 += utf16Fixup[c1>>11];
5616 if (c2 > (1<<11) * 26)
5617 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005618 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005619
5620 if (c1 != c2)
5621 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005622
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005623 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 }
5625
5626 return (len1 < len2) ? -1 : (len1 != len2);
5627}
5628
Marc-André Lemburge5034372000-08-08 08:04:29 +00005629#else
5630
5631static int
5632unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5633{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005634 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005635
5636 Py_UNICODE *s1 = str1->str;
5637 Py_UNICODE *s2 = str2->str;
5638
5639 len1 = str1->length;
5640 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005641
Marc-André Lemburge5034372000-08-08 08:04:29 +00005642 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005643 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005644
Fredrik Lundh45714e92001-06-26 16:39:36 +00005645 c1 = *s1++;
5646 c2 = *s2++;
5647
5648 if (c1 != c2)
5649 return (c1 < c2) ? -1 : 1;
5650
Marc-André Lemburge5034372000-08-08 08:04:29 +00005651 len1--; len2--;
5652 }
5653
5654 return (len1 < len2) ? -1 : (len1 != len2);
5655}
5656
5657#endif
5658
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659int PyUnicode_Compare(PyObject *left,
5660 PyObject *right)
5661{
5662 PyUnicodeObject *u = NULL, *v = NULL;
5663 int result;
5664
5665 /* Coerce the two arguments */
5666 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5667 if (u == NULL)
5668 goto onError;
5669 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5670 if (v == NULL)
5671 goto onError;
5672
Thomas Wouters7e474022000-07-16 12:04:32 +00005673 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 if (v == u) {
5675 Py_DECREF(u);
5676 Py_DECREF(v);
5677 return 0;
5678 }
5679
5680 result = unicode_compare(u, v);
5681
5682 Py_DECREF(u);
5683 Py_DECREF(v);
5684 return result;
5685
5686onError:
5687 Py_XDECREF(u);
5688 Py_XDECREF(v);
5689 return -1;
5690}
5691
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005692PyObject *PyUnicode_RichCompare(PyObject *left,
5693 PyObject *right,
5694 int op)
5695{
5696 int result;
5697
5698 result = PyUnicode_Compare(left, right);
5699 if (result == -1 && PyErr_Occurred())
5700 goto onError;
5701
5702 /* Convert the return value to a Boolean */
5703 switch (op) {
5704 case Py_EQ:
5705 result = (result == 0);
5706 break;
5707 case Py_NE:
5708 result = (result != 0);
5709 break;
5710 case Py_LE:
5711 result = (result <= 0);
5712 break;
5713 case Py_GE:
5714 result = (result >= 0);
5715 break;
5716 case Py_LT:
5717 result = (result == -1);
5718 break;
5719 case Py_GT:
5720 result = (result == 1);
5721 break;
5722 }
5723 return PyBool_FromLong(result);
5724
5725 onError:
5726
5727 /* Standard case
5728
5729 Type errors mean that PyUnicode_FromObject() could not convert
5730 one of the arguments (usually the right hand side) to Unicode,
5731 ie. we can't handle the comparison request. However, it is
5732 possible that the other object knows a comparison method, which
5733 is why we return Py_NotImplemented to give the other object a
5734 chance.
5735
5736 */
5737 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5738 PyErr_Clear();
5739 Py_INCREF(Py_NotImplemented);
5740 return Py_NotImplemented;
5741 }
5742 if (op != Py_EQ && op != Py_NE)
5743 return NULL;
5744
5745 /* Equality comparison.
5746
5747 This is a special case: we silence any PyExc_UnicodeDecodeError
5748 and instead turn it into a PyErr_UnicodeWarning.
5749
5750 */
5751 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5752 return NULL;
5753 PyErr_Clear();
5754 if (PyErr_Warn(PyExc_UnicodeWarning,
5755 (op == Py_EQ) ?
5756 "Unicode equal comparison "
5757 "failed to convert both arguments to Unicode - "
5758 "interpreting them as being unequal" :
5759 "Unicode unequal comparison "
5760 "failed to convert both arguments to Unicode - "
5761 "interpreting them as being unequal"
5762 ) < 0)
5763 return NULL;
5764 result = (op == Py_NE);
5765 return PyBool_FromLong(result);
5766}
5767
Guido van Rossum403d68b2000-03-13 15:55:09 +00005768int PyUnicode_Contains(PyObject *container,
5769 PyObject *element)
5770{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005771 PyObject *str, *sub;
5772 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005773
5774 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005775 sub = PyUnicode_FromObject(element);
5776 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005777 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005778 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005779 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005780 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005781
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005782 str = PyUnicode_FromObject(container);
5783 if (!str) {
5784 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005785 return -1;
5786 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005787
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005788 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005789
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005790 Py_DECREF(str);
5791 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005792
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005793 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005794}
5795
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796/* Concat to string or Unicode object giving a new Unicode object. */
5797
5798PyObject *PyUnicode_Concat(PyObject *left,
5799 PyObject *right)
5800{
5801 PyUnicodeObject *u = NULL, *v = NULL, *w;
5802
5803 /* Coerce the two arguments */
5804 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5805 if (u == NULL)
5806 goto onError;
5807 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5808 if (v == NULL)
5809 goto onError;
5810
5811 /* Shortcuts */
5812 if (v == unicode_empty) {
5813 Py_DECREF(v);
5814 return (PyObject *)u;
5815 }
5816 if (u == unicode_empty) {
5817 Py_DECREF(u);
5818 return (PyObject *)v;
5819 }
5820
5821 /* Concat the two Unicode strings */
5822 w = _PyUnicode_New(u->length + v->length);
5823 if (w == NULL)
5824 goto onError;
5825 Py_UNICODE_COPY(w->str, u->str, u->length);
5826 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5827
5828 Py_DECREF(u);
5829 Py_DECREF(v);
5830 return (PyObject *)w;
5831
5832onError:
5833 Py_XDECREF(u);
5834 Py_XDECREF(v);
5835 return NULL;
5836}
5837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005838PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839"S.count(sub[, start[, end]]) -> int\n\
5840\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005841Return the number of non-overlapping occurrences of substring sub in\n\
5842Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005843interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
5845static PyObject *
5846unicode_count(PyUnicodeObject *self, PyObject *args)
5847{
5848 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005849 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005850 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 PyObject *result;
5852
Guido van Rossumb8872e62000-05-09 14:14:27 +00005853 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5854 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 return NULL;
5856
5857 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005858 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 if (substring == NULL)
5860 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005861
Fredrik Lundhc8162812006-05-26 19:33:03 +00005862 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005864 result = PyInt_FromSsize_t(
5865 stringlib_count(self->str + start, end - start,
5866 substring->str, substring->length)
5867 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
5869 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005870
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 return result;
5872}
5873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005874PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005875"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005877Encodes S using the codec registered for encoding. encoding defaults\n\
5878to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005879handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5881'xmlcharrefreplace' as well as any other name registered with\n\
5882codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883
5884static PyObject *
5885unicode_encode(PyUnicodeObject *self, PyObject *args)
5886{
5887 char *encoding = NULL;
5888 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005889 PyObject *v;
5890
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5892 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005893 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005894 if (v == NULL)
5895 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005896 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5897 PyErr_Format(PyExc_TypeError,
5898 "encoder did not return a string/unicode object "
5899 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005900 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005901 Py_DECREF(v);
5902 return NULL;
5903 }
5904 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005905
5906 onError:
5907 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005908}
5909
5910PyDoc_STRVAR(decode__doc__,
5911"S.decode([encoding[,errors]]) -> string or unicode\n\
5912\n\
5913Decodes S using the codec registered for encoding. encoding defaults\n\
5914to the default encoding. errors may be given to set a different error\n\
5915handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5916a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5917as well as any other name registerd with codecs.register_error that is\n\
5918able to handle UnicodeDecodeErrors.");
5919
5920static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005921unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005922{
5923 char *encoding = NULL;
5924 char *errors = NULL;
5925 PyObject *v;
5926
5927 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5928 return NULL;
5929 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005930 if (v == NULL)
5931 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005932 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5933 PyErr_Format(PyExc_TypeError,
5934 "decoder did not return a string/unicode object "
5935 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005936 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005937 Py_DECREF(v);
5938 return NULL;
5939 }
5940 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005941
5942 onError:
5943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944}
5945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005946PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947"S.expandtabs([tabsize]) -> unicode\n\
5948\n\
5949Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005950If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
5952static PyObject*
5953unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5954{
5955 Py_UNICODE *e;
5956 Py_UNICODE *p;
5957 Py_UNICODE *q;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005958 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 PyUnicodeObject *u;
5960 int tabsize = 8;
5961
5962 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5963 return NULL;
5964
Thomas Wouters7e474022000-07-16 12:04:32 +00005965 /* First pass: determine size of output string */
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005966 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 e = self->str + self->length;
5968 for (p = self->str; p < e; p++)
5969 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005970 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 j += tabsize - (j % tabsize);
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005972 if (old_j > j) {
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00005973 PyErr_SetString(PyExc_OverflowError,
5974 "new string is too long");
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005975 return NULL;
5976 }
5977 old_j = j;
5978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 }
5980 else {
5981 j++;
5982 if (*p == '\n' || *p == '\r') {
5983 i += j;
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00005984 old_j = j = 0;
5985 if (i < 0) {
5986 PyErr_SetString(PyExc_OverflowError,
5987 "new string is too long");
5988 return NULL;
5989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 }
5991 }
5992
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005993 if ((i + j) < 0) {
5994 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5995 return NULL;
5996 }
5997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 /* Second pass: create output string and fill it */
5999 u = _PyUnicode_New(i + j);
6000 if (!u)
6001 return NULL;
6002
6003 j = 0;
6004 q = u->str;
6005
6006 for (p = self->str; p < e; p++)
6007 if (*p == '\t') {
6008 if (tabsize > 0) {
6009 i = tabsize - (j % tabsize);
6010 j += i;
6011 while (i--)
6012 *q++ = ' ';
6013 }
6014 }
6015 else {
6016 j++;
6017 *q++ = *p;
6018 if (*p == '\n' || *p == '\r')
6019 j = 0;
6020 }
6021
6022 return (PyObject*) u;
6023}
6024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006025PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026"S.find(sub [,start [,end]]) -> int\n\
6027\n\
6028Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006029such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030arguments start and end are interpreted as in slice notation.\n\
6031\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006032Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033
6034static PyObject *
6035unicode_find(PyUnicodeObject *self, PyObject *args)
6036{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006037 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006038 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006039 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006040 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
Guido van Rossumb8872e62000-05-09 14:14:27 +00006042 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6043 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006045 substring = PyUnicode_FromObject(substring);
6046 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 return NULL;
6048
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006049 result = stringlib_find_slice(
6050 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6051 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6052 start, end
6053 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054
6055 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006056
6057 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058}
6059
6060static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006061unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062{
6063 if (index < 0 || index >= self->length) {
6064 PyErr_SetString(PyExc_IndexError, "string index out of range");
6065 return NULL;
6066 }
6067
6068 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6069}
6070
6071static long
6072unicode_hash(PyUnicodeObject *self)
6073{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006074 /* Since Unicode objects compare equal to their ASCII string
6075 counterparts, they should use the individual character values
6076 as basis for their hash value. This is needed to assure that
6077 strings and Unicode objects behave in the same way as
6078 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006081 register Py_UNICODE *p;
6082 register long x;
6083
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 if (self->hash != -1)
6085 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006086 len = PyUnicode_GET_SIZE(self);
6087 p = PyUnicode_AS_UNICODE(self);
6088 x = *p << 7;
6089 while (--len >= 0)
6090 x = (1000003*x) ^ *p++;
6091 x ^= PyUnicode_GET_SIZE(self);
6092 if (x == -1)
6093 x = -2;
6094 self->hash = x;
6095 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096}
6097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006098PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099"S.index(sub [,start [,end]]) -> int\n\
6100\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006101Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102
6103static PyObject *
6104unicode_index(PyUnicodeObject *self, PyObject *args)
6105{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006106 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006107 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006108 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006109 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
Guido van Rossumb8872e62000-05-09 14:14:27 +00006111 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6112 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006114 substring = PyUnicode_FromObject(substring);
6115 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 return NULL;
6117
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006118 result = stringlib_find_slice(
6119 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6120 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6121 start, end
6122 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
6124 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006125
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 if (result < 0) {
6127 PyErr_SetString(PyExc_ValueError, "substring not found");
6128 return NULL;
6129 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006130
Martin v. Löwis18e16552006-02-15 17:27:45 +00006131 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132}
6133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006134PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006135"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006137Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006138at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
6140static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006141unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142{
6143 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6144 register const Py_UNICODE *e;
6145 int cased;
6146
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 /* Shortcut for single character strings */
6148 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006149 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006151 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006152 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006153 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006154
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 e = p + PyUnicode_GET_SIZE(self);
6156 cased = 0;
6157 for (; p < e; p++) {
6158 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006161 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 else if (!cased && Py_UNICODE_ISLOWER(ch))
6163 cased = 1;
6164 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006165 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166}
6167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006168PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006169"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006171Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006172at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173
6174static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006175unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176{
6177 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6178 register const Py_UNICODE *e;
6179 int cased;
6180
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 /* Shortcut for single character strings */
6182 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006183 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006185 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006186 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006187 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006188
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 e = p + PyUnicode_GET_SIZE(self);
6190 cased = 0;
6191 for (; p < e; p++) {
6192 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006193
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006195 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 else if (!cased && Py_UNICODE_ISUPPER(ch))
6197 cased = 1;
6198 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006199 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200}
6201
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006202PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006203"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006205Return True if S is a titlecased string and there is at least one\n\
6206character in S, i.e. upper- and titlecase characters may only\n\
6207follow uncased characters and lowercase characters only cased ones.\n\
6208Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209
6210static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006211unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212{
6213 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6214 register const Py_UNICODE *e;
6215 int cased, previous_is_cased;
6216
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 /* Shortcut for single character strings */
6218 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006219 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6220 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006222 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006223 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006224 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006225
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 e = p + PyUnicode_GET_SIZE(self);
6227 cased = 0;
6228 previous_is_cased = 0;
6229 for (; p < e; p++) {
6230 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006231
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6233 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006234 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 previous_is_cased = 1;
6236 cased = 1;
6237 }
6238 else if (Py_UNICODE_ISLOWER(ch)) {
6239 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006240 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 previous_is_cased = 1;
6242 cased = 1;
6243 }
6244 else
6245 previous_is_cased = 0;
6246 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006247 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248}
6249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006250PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006251"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006253Return True if all characters in S are whitespace\n\
6254and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255
6256static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006257unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258{
6259 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6260 register const Py_UNICODE *e;
6261
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 /* Shortcut for single character strings */
6263 if (PyUnicode_GET_SIZE(self) == 1 &&
6264 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006265 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006267 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006268 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006269 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006270
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 e = p + PyUnicode_GET_SIZE(self);
6272 for (; p < e; p++) {
6273 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006274 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006276 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277}
6278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006279PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006280"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006281\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006282Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006283and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006284
6285static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006286unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006287{
6288 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6289 register const Py_UNICODE *e;
6290
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006291 /* Shortcut for single character strings */
6292 if (PyUnicode_GET_SIZE(self) == 1 &&
6293 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006294 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006295
6296 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006297 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006298 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006299
6300 e = p + PyUnicode_GET_SIZE(self);
6301 for (; p < e; p++) {
6302 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006303 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006304 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006305 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006306}
6307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006308PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006309"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006310\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006311Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006312and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006313
6314static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006315unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006316{
6317 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6318 register const Py_UNICODE *e;
6319
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006320 /* Shortcut for single character strings */
6321 if (PyUnicode_GET_SIZE(self) == 1 &&
6322 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006323 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006324
6325 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006326 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006327 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006328
6329 e = p + PyUnicode_GET_SIZE(self);
6330 for (; p < e; p++) {
6331 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006332 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006333 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006334 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006335}
6336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006337PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006338"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006340Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006341False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
6343static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006344unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345{
6346 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6347 register const Py_UNICODE *e;
6348
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 /* Shortcut for single character strings */
6350 if (PyUnicode_GET_SIZE(self) == 1 &&
6351 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006352 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006354 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006355 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006356 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006357
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 e = p + PyUnicode_GET_SIZE(self);
6359 for (; p < e; p++) {
6360 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006361 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006363 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364}
6365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006366PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006367"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006369Return True if all characters in S are digits\n\
6370and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371
6372static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006373unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374{
6375 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6376 register const Py_UNICODE *e;
6377
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 /* Shortcut for single character strings */
6379 if (PyUnicode_GET_SIZE(self) == 1 &&
6380 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006381 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006383 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006384 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006385 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006386
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 e = p + PyUnicode_GET_SIZE(self);
6388 for (; p < e; p++) {
6389 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006390 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006392 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393}
6394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006395PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006396"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006398Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006399False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400
6401static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006402unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403{
6404 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6405 register const Py_UNICODE *e;
6406
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 /* Shortcut for single character strings */
6408 if (PyUnicode_GET_SIZE(self) == 1 &&
6409 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006410 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006412 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006413 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006414 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006415
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 e = p + PyUnicode_GET_SIZE(self);
6417 for (; p < e; p++) {
6418 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006419 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006421 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422}
6423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006424PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425"S.join(sequence) -> unicode\n\
6426\n\
6427Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006428sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
6430static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006431unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006433 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434}
6435
Martin v. Löwis18e16552006-02-15 17:27:45 +00006436static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437unicode_length(PyUnicodeObject *self)
6438{
6439 return self->length;
6440}
6441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006442PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006443"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444\n\
6445Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006446done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447
6448static PyObject *
6449unicode_ljust(PyUnicodeObject *self, PyObject *args)
6450{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006451 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006452 Py_UNICODE fillchar = ' ';
6453
Martin v. Löwis412fb672006-04-13 06:34:32 +00006454 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 return NULL;
6456
Tim Peters7a29bd52001-09-12 03:03:31 +00006457 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 Py_INCREF(self);
6459 return (PyObject*) self;
6460 }
6461
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006462 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463}
6464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006465PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466"S.lower() -> unicode\n\
6467\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006468Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469
6470static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006471unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 return fixup(self, fixlower);
6474}
6475
/* Strip modes; the values double as indices into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for mode i: the format string with its leading "|O:"
   (three characters) skipped */
#define STRIPNAME(i) (&stripformat[i][3])
6484
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006485/* externally visible for str.strip(unicode) */
6486PyObject *
6487_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6488{
6489 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006490 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006491 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006492 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6493 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006494
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006495 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6496
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006497 i = 0;
6498 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006499 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6500 i++;
6501 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006502 }
6503
6504 j = len;
6505 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006506 do {
6507 j--;
6508 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6509 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006510 }
6511
6512 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006513 Py_INCREF(self);
6514 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006515 }
6516 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006517 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006518}
6519
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
6521static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006522do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006524 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006525 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006526
6527 i = 0;
6528 if (striptype != RIGHTSTRIP) {
6529 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6530 i++;
6531 }
6532 }
6533
6534 j = len;
6535 if (striptype != LEFTSTRIP) {
6536 do {
6537 j--;
6538 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6539 j++;
6540 }
6541
6542 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6543 Py_INCREF(self);
6544 return (PyObject*)self;
6545 }
6546 else
6547 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548}
6549
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006550
6551static PyObject *
6552do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6553{
6554 PyObject *sep = NULL;
6555
6556 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6557 return NULL;
6558
6559 if (sep != NULL && sep != Py_None) {
6560 if (PyUnicode_Check(sep))
6561 return _PyUnicode_XStrip(self, striptype, sep);
6562 else if (PyString_Check(sep)) {
6563 PyObject *res;
6564 sep = PyUnicode_FromObject(sep);
6565 if (sep==NULL)
6566 return NULL;
6567 res = _PyUnicode_XStrip(self, striptype, sep);
6568 Py_DECREF(sep);
6569 return res;
6570 }
6571 else {
6572 PyErr_Format(PyExc_TypeError,
6573 "%s arg must be None, unicode or str",
6574 STRIPNAME(striptype));
6575 return NULL;
6576 }
6577 }
6578
6579 return do_strip(self, striptype);
6580}
6581
6582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006583PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006584"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006585\n\
6586Return a copy of the string S with leading and trailing\n\
6587whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006588If chars is given and not None, remove characters in chars instead.\n\
6589If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006590
6591static PyObject *
6592unicode_strip(PyUnicodeObject *self, PyObject *args)
6593{
6594 if (PyTuple_GET_SIZE(args) == 0)
6595 return do_strip(self, BOTHSTRIP); /* Common case */
6596 else
6597 return do_argstrip(self, BOTHSTRIP, args);
6598}
6599
6600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006601PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006602"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006603\n\
6604Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006605If chars is given and not None, remove characters in chars instead.\n\
6606If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006607
6608static PyObject *
6609unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6610{
6611 if (PyTuple_GET_SIZE(args) == 0)
6612 return do_strip(self, LEFTSTRIP); /* Common case */
6613 else
6614 return do_argstrip(self, LEFTSTRIP, args);
6615}
6616
6617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006618PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006619"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006620\n\
6621Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006622If chars is given and not None, remove characters in chars instead.\n\
6623If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006624
6625static PyObject *
6626unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6627{
6628 if (PyTuple_GET_SIZE(args) == 0)
6629 return do_strip(self, RIGHTSTRIP); /* Common case */
6630 else
6631 return do_argstrip(self, RIGHTSTRIP, args);
6632}
6633
6634
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006636unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637{
6638 PyUnicodeObject *u;
6639 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006640 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006641 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642
6643 if (len < 0)
6644 len = 0;
6645
Tim Peters7a29bd52001-09-12 03:03:31 +00006646 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 /* no repeat, return original string */
6648 Py_INCREF(str);
6649 return (PyObject*) str;
6650 }
Tim Peters8f422462000-09-09 06:13:41 +00006651
6652 /* ensure # of chars needed doesn't overflow int and # of bytes
6653 * needed doesn't overflow size_t
6654 */
6655 nchars = len * str->length;
6656 if (len && nchars / len != str->length) {
6657 PyErr_SetString(PyExc_OverflowError,
6658 "repeated string is too long");
6659 return NULL;
6660 }
6661 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6662 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6663 PyErr_SetString(PyExc_OverflowError,
6664 "repeated string is too long");
6665 return NULL;
6666 }
6667 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 if (!u)
6669 return NULL;
6670
6671 p = u->str;
6672
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006673 if (str->length == 1 && len > 0) {
6674 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006675 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006676 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006677 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006678 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006679 done = str->length;
6680 }
6681 while (done < nchars) {
6682 int n = (done <= nchars-done) ? done : nchars-done;
6683 Py_UNICODE_COPY(p+done, p, n);
6684 done += n;
6685 }
6686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
6688 return (PyObject*) u;
6689}
6690
6691PyObject *PyUnicode_Replace(PyObject *obj,
6692 PyObject *subobj,
6693 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695{
6696 PyObject *self;
6697 PyObject *str1;
6698 PyObject *str2;
6699 PyObject *result;
6700
6701 self = PyUnicode_FromObject(obj);
6702 if (self == NULL)
6703 return NULL;
6704 str1 = PyUnicode_FromObject(subobj);
6705 if (str1 == NULL) {
6706 Py_DECREF(self);
6707 return NULL;
6708 }
6709 str2 = PyUnicode_FromObject(replobj);
6710 if (str2 == NULL) {
6711 Py_DECREF(self);
6712 Py_DECREF(str1);
6713 return NULL;
6714 }
Tim Petersced69f82003-09-16 20:30:58 +00006715 result = replace((PyUnicodeObject *)self,
6716 (PyUnicodeObject *)str1,
6717 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 maxcount);
6719 Py_DECREF(self);
6720 Py_DECREF(str1);
6721 Py_DECREF(str2);
6722 return result;
6723}
6724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006725PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726"S.replace (old, new[, maxsplit]) -> unicode\n\
6727\n\
6728Return a copy of S with all occurrences of substring\n\
6729old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006730given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731
6732static PyObject*
6733unicode_replace(PyUnicodeObject *self, PyObject *args)
6734{
6735 PyUnicodeObject *str1;
6736 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006737 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 PyObject *result;
6739
Martin v. Löwis18e16552006-02-15 17:27:45 +00006740 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 return NULL;
6742 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6743 if (str1 == NULL)
6744 return NULL;
6745 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006746 if (str2 == NULL) {
6747 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750
6751 result = replace(self, str1, str2, maxcount);
6752
6753 Py_DECREF(str1);
6754 Py_DECREF(str2);
6755 return result;
6756}
6757
6758static
6759PyObject *unicode_repr(PyObject *unicode)
6760{
6761 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6762 PyUnicode_GET_SIZE(unicode),
6763 1);
6764}
6765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006766PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767"S.rfind(sub [,start [,end]]) -> int\n\
6768\n\
6769Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006770such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771arguments start and end are interpreted as in slice notation.\n\
6772\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006773Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774
6775static PyObject *
6776unicode_rfind(PyUnicodeObject *self, PyObject *args)
6777{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006778 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006779 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006780 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006781 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782
Guido van Rossumb8872e62000-05-09 14:14:27 +00006783 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6784 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006786 substring = PyUnicode_FromObject(substring);
6787 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 return NULL;
6789
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006790 result = stringlib_rfind_slice(
6791 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6792 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6793 start, end
6794 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795
6796 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006797
6798 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006801PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802"S.rindex(sub [,start [,end]]) -> int\n\
6803\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006804Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805
6806static PyObject *
6807unicode_rindex(PyUnicodeObject *self, PyObject *args)
6808{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006809 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006810 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006811 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006812 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813
Guido van Rossumb8872e62000-05-09 14:14:27 +00006814 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6815 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006817 substring = PyUnicode_FromObject(substring);
6818 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 return NULL;
6820
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006821 result = stringlib_rfind_slice(
6822 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6823 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6824 start, end
6825 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
6827 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006828
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 if (result < 0) {
6830 PyErr_SetString(PyExc_ValueError, "substring not found");
6831 return NULL;
6832 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006833 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834}
6835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006836PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006837"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838\n\
6839Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006840done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
6842static PyObject *
6843unicode_rjust(PyUnicodeObject *self, PyObject *args)
6844{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006845 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006846 Py_UNICODE fillchar = ' ';
6847
Martin v. Löwis412fb672006-04-13 06:34:32 +00006848 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 return NULL;
6850
Tim Peters7a29bd52001-09-12 03:03:31 +00006851 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 Py_INCREF(self);
6853 return (PyObject*) self;
6854 }
6855
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006856 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857}
6858
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006860unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861{
6862 /* standard clamping */
6863 if (start < 0)
6864 start = 0;
6865 if (end < 0)
6866 end = 0;
6867 if (end > self->length)
6868 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006869 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 /* full slice, return original string */
6871 Py_INCREF(self);
6872 return (PyObject*) self;
6873 }
6874 if (start > end)
6875 start = end;
6876 /* copy slice */
6877 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6878 end - start);
6879}
6880
6881PyObject *PyUnicode_Split(PyObject *s,
6882 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006883 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884{
6885 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006886
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 s = PyUnicode_FromObject(s);
6888 if (s == NULL)
6889 return NULL;
6890 if (sep != NULL) {
6891 sep = PyUnicode_FromObject(sep);
6892 if (sep == NULL) {
6893 Py_DECREF(s);
6894 return NULL;
6895 }
6896 }
6897
6898 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6899
6900 Py_DECREF(s);
6901 Py_XDECREF(sep);
6902 return result;
6903}
6904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006905PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906"S.split([sep [,maxsplit]]) -> list of strings\n\
6907\n\
6908Return a list of the words in S, using sep as the\n\
6909delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006910splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006911any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912
6913static PyObject*
6914unicode_split(PyUnicodeObject *self, PyObject *args)
6915{
6916 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006917 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918
Martin v. Löwis18e16552006-02-15 17:27:45 +00006919 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 return NULL;
6921
6922 if (substring == Py_None)
6923 return split(self, NULL, maxcount);
6924 else if (PyUnicode_Check(substring))
6925 return split(self, (PyUnicodeObject *)substring, maxcount);
6926 else
6927 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6928}
6929
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006930PyObject *
6931PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6932{
6933 PyObject* str_obj;
6934 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006935 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006936
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006937 str_obj = PyUnicode_FromObject(str_in);
6938 if (!str_obj)
6939 return NULL;
6940 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006941 if (!sep_obj) {
6942 Py_DECREF(str_obj);
6943 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006944 }
6945
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006946 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006947 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6948 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6949 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006950
Fredrik Lundhb9479482006-05-26 17:22:38 +00006951 Py_DECREF(sep_obj);
6952 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006953
6954 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006955}
6956
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006957
6958PyObject *
6959PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6960{
6961 PyObject* str_obj;
6962 PyObject* sep_obj;
6963 PyObject* out;
6964
6965 str_obj = PyUnicode_FromObject(str_in);
6966 if (!str_obj)
6967 return NULL;
6968 sep_obj = PyUnicode_FromObject(sep_in);
6969 if (!sep_obj) {
6970 Py_DECREF(str_obj);
6971 return NULL;
6972 }
6973
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006974 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006975 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6976 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6977 );
6978
6979 Py_DECREF(sep_obj);
6980 Py_DECREF(str_obj);
6981
6982 return out;
6983}
6984
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006985PyDoc_STRVAR(partition__doc__,
6986"S.partition(sep) -> (head, sep, tail)\n\
6987\n\
6988Searches for the separator sep in S, and returns the part before it,\n\
6989the separator itself, and the part after it. If the separator is not\n\
6990found, returns S and two empty strings.");
6991
6992static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006993unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006994{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006995 return PyUnicode_Partition((PyObject *)self, separator);
6996}
6997
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006998PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00006999"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007000\n\
7001Searches for the separator sep in S, starting at the end of S, and returns\n\
7002the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007003separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007004
7005static PyObject*
7006unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7007{
7008 return PyUnicode_RPartition((PyObject *)self, separator);
7009}
7010
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007011PyObject *PyUnicode_RSplit(PyObject *s,
7012 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007013 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007014{
7015 PyObject *result;
7016
7017 s = PyUnicode_FromObject(s);
7018 if (s == NULL)
7019 return NULL;
7020 if (sep != NULL) {
7021 sep = PyUnicode_FromObject(sep);
7022 if (sep == NULL) {
7023 Py_DECREF(s);
7024 return NULL;
7025 }
7026 }
7027
7028 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7029
7030 Py_DECREF(s);
7031 Py_XDECREF(sep);
7032 return result;
7033}
7034
7035PyDoc_STRVAR(rsplit__doc__,
7036"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7037\n\
7038Return a list of the words in S, using sep as the\n\
7039delimiter string, starting at the end of the string and\n\
7040working to the front. If maxsplit is given, at most maxsplit\n\
7041splits are done. If sep is not specified, any whitespace string\n\
7042is a separator.");
7043
7044static PyObject*
7045unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7046{
7047 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007048 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007049
Martin v. Löwis18e16552006-02-15 17:27:45 +00007050 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007051 return NULL;
7052
7053 if (substring == Py_None)
7054 return rsplit(self, NULL, maxcount);
7055 else if (PyUnicode_Check(substring))
7056 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7057 else
7058 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7059}
7060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007061PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007062"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063\n\
7064Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007065Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067
7068static PyObject*
7069unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7070{
Guido van Rossum86662912000-04-11 15:38:46 +00007071 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072
Guido van Rossum86662912000-04-11 15:38:46 +00007073 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 return NULL;
7075
Guido van Rossum86662912000-04-11 15:38:46 +00007076 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077}
7078
7079static
7080PyObject *unicode_str(PyUnicodeObject *self)
7081{
Fred Drakee4315f52000-05-09 19:53:39 +00007082 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083}
7084
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007085PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086"S.swapcase() -> unicode\n\
7087\n\
7088Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007089and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090
7091static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007092unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 return fixup(self, fixswapcase);
7095}
7096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007097PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098"S.translate(table) -> unicode\n\
7099\n\
7100Return a copy of the string S, where all characters have been mapped\n\
7101through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007102Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7103Unmapped characters are left untouched. Characters mapped to None\n\
7104are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105
7106static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007107unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108{
Tim Petersced69f82003-09-16 20:30:58 +00007109 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007111 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 "ignore");
7113}
7114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007115PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116"S.upper() -> unicode\n\
7117\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007118Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119
7120static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007121unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123 return fixup(self, fixupper);
7124}
7125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007126PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127"S.zfill(width) -> unicode\n\
7128\n\
7129Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007130of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131
7132static PyObject *
7133unicode_zfill(PyUnicodeObject *self, PyObject *args)
7134{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007135 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 PyUnicodeObject *u;
7137
Martin v. Löwis18e16552006-02-15 17:27:45 +00007138 Py_ssize_t width;
7139 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 return NULL;
7141
7142 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007143 if (PyUnicode_CheckExact(self)) {
7144 Py_INCREF(self);
7145 return (PyObject*) self;
7146 }
7147 else
7148 return PyUnicode_FromUnicode(
7149 PyUnicode_AS_UNICODE(self),
7150 PyUnicode_GET_SIZE(self)
7151 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 }
7153
7154 fill = width - self->length;
7155
7156 u = pad(self, fill, 0, '0');
7157
Walter Dörwald068325e2002-04-15 13:36:47 +00007158 if (u == NULL)
7159 return NULL;
7160
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 if (u->str[fill] == '+' || u->str[fill] == '-') {
7162 /* move sign to beginning of string */
7163 u->str[0] = u->str[fill];
7164 u->str[fill] = '0';
7165 }
7166
7167 return (PyObject*) u;
7168}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169
#if 0
/* Disabled debugging helper: reports unicode_freelist_size as an int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
7177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007178PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007179"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007181Return True if S starts with the specified prefix, False otherwise.\n\
7182With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007183With optional end, stop comparing S at that position.\n\
7184prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185
7186static PyObject *
7187unicode_startswith(PyUnicodeObject *self,
7188 PyObject *args)
7189{
Georg Brandl24250812006-06-09 18:45:48 +00007190 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007192 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007193 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007194 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195
Georg Brandl24250812006-06-09 18:45:48 +00007196 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007197 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007199 if (PyTuple_Check(subobj)) {
7200 Py_ssize_t i;
7201 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7202 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7203 PyTuple_GET_ITEM(subobj, i));
7204 if (substring == NULL)
7205 return NULL;
7206 result = tailmatch(self, substring, start, end, -1);
7207 Py_DECREF(substring);
7208 if (result) {
7209 Py_RETURN_TRUE;
7210 }
7211 }
7212 /* nothing matched */
7213 Py_RETURN_FALSE;
7214 }
7215 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007217 return NULL;
7218 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007220 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221}
7222
7223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007224PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007225"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007227Return True if S ends with the specified suffix, False otherwise.\n\
7228With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007229With optional end, stop comparing S at that position.\n\
7230suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231
7232static PyObject *
7233unicode_endswith(PyUnicodeObject *self,
7234 PyObject *args)
7235{
Georg Brandl24250812006-06-09 18:45:48 +00007236 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007238 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007239 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007240 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241
Georg Brandl24250812006-06-09 18:45:48 +00007242 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7243 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007245 if (PyTuple_Check(subobj)) {
7246 Py_ssize_t i;
7247 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7248 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7249 PyTuple_GET_ITEM(subobj, i));
7250 if (substring == NULL)
7251 return NULL;
7252 result = tailmatch(self, substring, start, end, +1);
7253 Py_DECREF(substring);
7254 if (result) {
7255 Py_RETURN_TRUE;
7256 }
7257 }
7258 Py_RETURN_FALSE;
7259 }
7260 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
Georg Brandl24250812006-06-09 18:45:48 +00007264 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007266 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267}
7268
7269
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007270
7271static PyObject *
7272unicode_getnewargs(PyUnicodeObject *v)
7273{
7274 return Py_BuildValue("(u#)", v->str, v->length);
7275}
7276
7277
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278static PyMethodDef unicode_methods[] = {
7279
7280 /* Order is according to common usage: often used methods should
7281 appear first, since lookup is done sequentially. */
7282
Georg Brandlecdc0a92006-03-30 12:19:07 +00007283 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007284 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7285 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007286 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007287 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7288 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7289 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7290 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7291 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7292 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7293 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007294 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007295 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7296 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7297 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007298 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007299 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007300/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7301 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7302 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7303 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007304 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007305 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007306 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007307 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007308 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7309 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7310 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7311 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7312 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7313 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7314 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7315 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7316 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7317 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7318 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7319 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7320 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7321 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007322 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007323#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007324 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325#endif
7326
7327#if 0
7328 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007329 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330#endif
7331
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007332 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 {NULL, NULL}
7334};
7335
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007336static PyObject *
7337unicode_mod(PyObject *v, PyObject *w)
7338{
7339 if (!PyUnicode_Check(v)) {
7340 Py_INCREF(Py_NotImplemented);
7341 return Py_NotImplemented;
7342 }
7343 return PyUnicode_Format(v, w);
7344}
7345
7346static PyNumberMethods unicode_as_number = {
7347 0, /*nb_add*/
7348 0, /*nb_subtract*/
7349 0, /*nb_multiply*/
7350 0, /*nb_divide*/
7351 unicode_mod, /*nb_remainder*/
7352};
7353
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007355 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007356 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007357 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7358 (ssizeargfunc) unicode_getitem, /* sq_item */
7359 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 0, /* sq_ass_item */
7361 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007362 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363};
7364
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007365static PyObject*
7366unicode_subscript(PyUnicodeObject* self, PyObject* item)
7367{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007368 if (PyIndex_Check(item)) {
7369 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007370 if (i == -1 && PyErr_Occurred())
7371 return NULL;
7372 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007373 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007374 return unicode_getitem(self, i);
7375 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007376 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007377 Py_UNICODE* source_buf;
7378 Py_UNICODE* result_buf;
7379 PyObject* result;
7380
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007381 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007382 &start, &stop, &step, &slicelength) < 0) {
7383 return NULL;
7384 }
7385
7386 if (slicelength <= 0) {
7387 return PyUnicode_FromUnicode(NULL, 0);
7388 } else {
7389 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007390 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7391 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007392
7393 if (result_buf == NULL)
7394 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007395
7396 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7397 result_buf[i] = source_buf[cur];
7398 }
Tim Petersced69f82003-09-16 20:30:58 +00007399
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007400 result = PyUnicode_FromUnicode(result_buf, slicelength);
7401 PyMem_FREE(result_buf);
7402 return result;
7403 }
7404 } else {
7405 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7406 return NULL;
7407 }
7408}
7409
7410static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007411 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007412 (binaryfunc)unicode_subscript, /* mp_subscript */
7413 (objobjargproc)0, /* mp_ass_subscript */
7414};
7415
Martin v. Löwis18e16552006-02-15 17:27:45 +00007416static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007418 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 const void **ptr)
7420{
7421 if (index != 0) {
7422 PyErr_SetString(PyExc_SystemError,
7423 "accessing non-existent unicode segment");
7424 return -1;
7425 }
7426 *ptr = (void *) self->str;
7427 return PyUnicode_GET_DATA_SIZE(self);
7428}
7429
Martin v. Löwis18e16552006-02-15 17:27:45 +00007430static Py_ssize_t
7431unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 const void **ptr)
7433{
7434 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007435 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 return -1;
7437}
7438
7439static int
7440unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442{
7443 if (lenp)
7444 *lenp = PyUnicode_GET_DATA_SIZE(self);
7445 return 1;
7446}
7447
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007448static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007450 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 const void **ptr)
7452{
7453 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007454
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 if (index != 0) {
7456 PyErr_SetString(PyExc_SystemError,
7457 "accessing non-existent unicode segment");
7458 return -1;
7459 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007460 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 if (str == NULL)
7462 return -1;
7463 *ptr = (void *) PyString_AS_STRING(str);
7464 return PyString_GET_SIZE(str);
7465}
7466
7467/* Helpers for PyUnicode_Format() */
7468
7469static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007470getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007472 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473 if (argidx < arglen) {
7474 (*p_argidx)++;
7475 if (arglen < 0)
7476 return args;
7477 else
7478 return PyTuple_GetItem(args, argidx);
7479 }
7480 PyErr_SetString(PyExc_TypeError,
7481 "not enough arguments for format string");
7482 return NULL;
7483}
7484
/* Bit flags for the formatting helpers.  F_ALT selects the '#' alternate
   form in formatfloat() and formatint(); the others are presumably the
   remaining printf-style flags -- confirm against PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
7490
Martin v. Löwis18e16552006-02-15 17:27:45 +00007491static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007492strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007494 register Py_ssize_t i;
7495 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 for (i = len - 1; i >= 0; i--)
7497 buffer[i] = (Py_UNICODE) charbuffer[i];
7498
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 return len;
7500}
7501
Neal Norwitzfc76d632006-01-10 06:03:13 +00007502static int
7503doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7504{
Tim Peters15231542006-02-16 01:08:01 +00007505 Py_ssize_t result;
7506
Neal Norwitzfc76d632006-01-10 06:03:13 +00007507 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007508 result = strtounicode(buffer, (char *)buffer);
7509 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007510}
7511
7512static int
7513longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7514{
Tim Peters15231542006-02-16 01:08:01 +00007515 Py_ssize_t result;
7516
Neal Norwitzfc76d632006-01-10 06:03:13 +00007517 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007518 result = strtounicode(buffer, (char *)buffer);
7519 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007520}
7521
Guido van Rossum078151d2002-08-11 04:24:12 +00007522/* XXX To save some code duplication, formatfloat/long/int could have been
7523 shared with stringobject.c, converting from 8-bit to Unicode after the
7524 formatting is done. */
7525
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526static int
7527formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007528 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 int flags,
7530 int prec,
7531 int type,
7532 PyObject *v)
7533{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007534 /* fmt = '%#.' + `prec` + `type`
7535 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 char fmt[20];
7537 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007538
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 x = PyFloat_AsDouble(v);
7540 if (x == -1.0 && PyErr_Occurred())
7541 return -1;
7542 if (prec < 0)
7543 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7545 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007546 /* Worst case length calc to ensure no buffer overrun:
7547
7548 'g' formats:
7549 fmt = %#.<prec>g
7550 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7551 for any double rep.)
7552 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7553
7554 'f' formats:
7555 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7556 len = 1 + 50 + 1 + prec = 52 + prec
7557
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007558 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007559 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007560
7561 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00007562 if (((type == 'g' || type == 'G') &&
7563 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007564 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007565 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007566 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007567 return -1;
7568 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007569 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7570 (flags&F_ALT) ? "#" : "",
7571 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007572 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573}
7574
Tim Peters38fd5b62000-09-21 05:43:11 +00007575static PyObject*
7576formatlong(PyObject *val, int flags, int prec, int type)
7577{
7578 char *buf;
7579 int i, len;
7580 PyObject *str; /* temporary string object. */
7581 PyUnicodeObject *result;
7582
7583 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7584 if (!str)
7585 return NULL;
7586 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007587 if (!result) {
7588 Py_DECREF(str);
7589 return NULL;
7590 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007591 for (i = 0; i < len; i++)
7592 result->str[i] = buf[i];
7593 result->str[len] = 0;
7594 Py_DECREF(str);
7595 return (PyObject*)result;
7596}
7597
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598static int
7599formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007600 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 int flags,
7602 int prec,
7603 int type,
7604 PyObject *v)
7605{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007606 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007607 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7608 * + 1 + 1
7609 * = 24
7610 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007611 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007612 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 long x;
7614
7615 x = PyInt_AsLong(v);
7616 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007617 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007618 if (x < 0 && type == 'u') {
7619 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007620 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007621 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7622 sign = "-";
7623 else
7624 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007626 prec = 1;
7627
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007628 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7629 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007630 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007631 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007632 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007633 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007634 return -1;
7635 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007636
7637 if ((flags & F_ALT) &&
7638 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007639 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007640 * of issues that cause pain:
7641 * - when 0 is being converted, the C standard leaves off
7642 * the '0x' or '0X', which is inconsistent with other
7643 * %#x/%#X conversions and inconsistent with Python's
7644 * hex() function
7645 * - there are platforms that violate the standard and
7646 * convert 0 with the '0x' or '0X'
7647 * (Metrowerks, Compaq Tru64)
7648 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007649 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007650 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007651 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007652 * We can achieve the desired consistency by inserting our
7653 * own '0x' or '0X' prefix, and substituting %x/%X in place
7654 * of %#x/%#X.
7655 *
7656 * Note that this is the same approach as used in
7657 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007658 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007659 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7660 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007661 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007662 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007663 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7664 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007665 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007666 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007667 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007668 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007669 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007670 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671}
7672
7673static int
7674formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007675 size_t buflen,
7676 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007678 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007679 if (PyUnicode_Check(v)) {
7680 if (PyUnicode_GET_SIZE(v) != 1)
7681 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007685 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007686 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007687 goto onError;
7688 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690
7691 else {
7692 /* Integer input truncated to a character */
7693 long x;
7694 x = PyInt_AsLong(v);
7695 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007696 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007697#ifdef Py_UNICODE_WIDE
7698 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007699 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007700 "%c arg not in range(0x110000) "
7701 "(wide Python build)");
7702 return -1;
7703 }
7704#else
7705 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007706 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007707 "%c arg not in range(0x10000) "
7708 "(narrow Python build)");
7709 return -1;
7710 }
7711#endif
7712 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 }
7714 buf[1] = '\0';
7715 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007716
7717 onError:
7718 PyErr_SetString(PyExc_TypeError,
7719 "%c requires int or char");
7720 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721}
7722
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single primary expression wherever it is used (for instance as the
   operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
7732
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733PyObject *PyUnicode_Format(PyObject *format,
7734 PyObject *args)
7735{
7736 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007737 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 int args_owned = 0;
7739 PyUnicodeObject *result = NULL;
7740 PyObject *dict = NULL;
7741 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007742
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 if (format == NULL || args == NULL) {
7744 PyErr_BadInternalCall();
7745 return NULL;
7746 }
7747 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007748 if (uformat == NULL)
7749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 fmt = PyUnicode_AS_UNICODE(uformat);
7751 fmtcnt = PyUnicode_GET_SIZE(uformat);
7752
7753 reslen = rescnt = fmtcnt + 100;
7754 result = _PyUnicode_New(reslen);
7755 if (result == NULL)
7756 goto onError;
7757 res = PyUnicode_AS_UNICODE(result);
7758
7759 if (PyTuple_Check(args)) {
7760 arglen = PyTuple_Size(args);
7761 argidx = 0;
7762 }
7763 else {
7764 arglen = -1;
7765 argidx = -2;
7766 }
Martin v. Löwis68192102007-07-21 06:55:02 +00007767 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007768 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 dict = args;
7770
7771 while (--fmtcnt >= 0) {
7772 if (*fmt != '%') {
7773 if (--rescnt < 0) {
7774 rescnt = fmtcnt + 100;
7775 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007776 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007777 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7779 --rescnt;
7780 }
7781 *res++ = *fmt++;
7782 }
7783 else {
7784 /* Got a format specifier */
7785 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007786 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788 Py_UNICODE c = '\0';
7789 Py_UNICODE fill;
7790 PyObject *v = NULL;
7791 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007792 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007794 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007795 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796
7797 fmt++;
7798 if (*fmt == '(') {
7799 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007800 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 PyObject *key;
7802 int pcount = 1;
7803
7804 if (dict == NULL) {
7805 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007806 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 goto onError;
7808 }
7809 ++fmt;
7810 --fmtcnt;
7811 keystart = fmt;
7812 /* Skip over balanced parentheses */
7813 while (pcount > 0 && --fmtcnt >= 0) {
7814 if (*fmt == ')')
7815 --pcount;
7816 else if (*fmt == '(')
7817 ++pcount;
7818 fmt++;
7819 }
7820 keylen = fmt - keystart - 1;
7821 if (fmtcnt < 0 || pcount > 0) {
7822 PyErr_SetString(PyExc_ValueError,
7823 "incomplete format key");
7824 goto onError;
7825 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007826#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007827 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 then looked up since Python uses strings to hold
7829 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007830 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 key = PyUnicode_EncodeUTF8(keystart,
7832 keylen,
7833 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007834#else
7835 key = PyUnicode_FromUnicode(keystart, keylen);
7836#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 if (key == NULL)
7838 goto onError;
7839 if (args_owned) {
7840 Py_DECREF(args);
7841 args_owned = 0;
7842 }
7843 args = PyObject_GetItem(dict, key);
7844 Py_DECREF(key);
7845 if (args == NULL) {
7846 goto onError;
7847 }
7848 args_owned = 1;
7849 arglen = -1;
7850 argidx = -2;
7851 }
7852 while (--fmtcnt >= 0) {
7853 switch (c = *fmt++) {
7854 case '-': flags |= F_LJUST; continue;
7855 case '+': flags |= F_SIGN; continue;
7856 case ' ': flags |= F_BLANK; continue;
7857 case '#': flags |= F_ALT; continue;
7858 case '0': flags |= F_ZERO; continue;
7859 }
7860 break;
7861 }
7862 if (c == '*') {
7863 v = getnextarg(args, arglen, &argidx);
7864 if (v == NULL)
7865 goto onError;
7866 if (!PyInt_Check(v)) {
7867 PyErr_SetString(PyExc_TypeError,
7868 "* wants int");
7869 goto onError;
7870 }
7871 width = PyInt_AsLong(v);
7872 if (width < 0) {
7873 flags |= F_LJUST;
7874 width = -width;
7875 }
7876 if (--fmtcnt >= 0)
7877 c = *fmt++;
7878 }
7879 else if (c >= '0' && c <= '9') {
7880 width = c - '0';
7881 while (--fmtcnt >= 0) {
7882 c = *fmt++;
7883 if (c < '0' || c > '9')
7884 break;
7885 if ((width*10) / 10 != width) {
7886 PyErr_SetString(PyExc_ValueError,
7887 "width too big");
7888 goto onError;
7889 }
7890 width = width*10 + (c - '0');
7891 }
7892 }
7893 if (c == '.') {
7894 prec = 0;
7895 if (--fmtcnt >= 0)
7896 c = *fmt++;
7897 if (c == '*') {
7898 v = getnextarg(args, arglen, &argidx);
7899 if (v == NULL)
7900 goto onError;
7901 if (!PyInt_Check(v)) {
7902 PyErr_SetString(PyExc_TypeError,
7903 "* wants int");
7904 goto onError;
7905 }
7906 prec = PyInt_AsLong(v);
7907 if (prec < 0)
7908 prec = 0;
7909 if (--fmtcnt >= 0)
7910 c = *fmt++;
7911 }
7912 else if (c >= '0' && c <= '9') {
7913 prec = c - '0';
7914 while (--fmtcnt >= 0) {
7915 c = Py_CHARMASK(*fmt++);
7916 if (c < '0' || c > '9')
7917 break;
7918 if ((prec*10) / 10 != prec) {
7919 PyErr_SetString(PyExc_ValueError,
7920 "prec too big");
7921 goto onError;
7922 }
7923 prec = prec*10 + (c - '0');
7924 }
7925 }
7926 } /* prec */
7927 if (fmtcnt >= 0) {
7928 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929 if (--fmtcnt >= 0)
7930 c = *fmt++;
7931 }
7932 }
7933 if (fmtcnt < 0) {
7934 PyErr_SetString(PyExc_ValueError,
7935 "incomplete format");
7936 goto onError;
7937 }
7938 if (c != '%') {
7939 v = getnextarg(args, arglen, &argidx);
7940 if (v == NULL)
7941 goto onError;
7942 }
7943 sign = 0;
7944 fill = ' ';
7945 switch (c) {
7946
7947 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007948 pbuf = formatbuf;
7949 /* presume that buffer length is at least 1 */
7950 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951 len = 1;
7952 break;
7953
7954 case 's':
7955 case 'r':
7956 if (PyUnicode_Check(v) && c == 's') {
7957 temp = v;
7958 Py_INCREF(temp);
7959 }
7960 else {
7961 PyObject *unicode;
7962 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007963 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 else
7965 temp = PyObject_Repr(v);
7966 if (temp == NULL)
7967 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007968 if (PyUnicode_Check(temp))
7969 /* nothing to do */;
7970 else if (PyString_Check(temp)) {
7971 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007972 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007974 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007976 Py_DECREF(temp);
7977 temp = unicode;
7978 if (temp == NULL)
7979 goto onError;
7980 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007981 else {
7982 Py_DECREF(temp);
7983 PyErr_SetString(PyExc_TypeError,
7984 "%s argument has non-string str()");
7985 goto onError;
7986 }
7987 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007988 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 len = PyUnicode_GET_SIZE(temp);
7990 if (prec >= 0 && len > prec)
7991 len = prec;
7992 break;
7993
7994 case 'i':
7995 case 'd':
7996 case 'u':
7997 case 'o':
7998 case 'x':
7999 case 'X':
8000 if (c == 'i')
8001 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008002 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008003 temp = formatlong(v, flags, prec, c);
8004 if (!temp)
8005 goto onError;
8006 pbuf = PyUnicode_AS_UNICODE(temp);
8007 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008008 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008010 else {
8011 pbuf = formatbuf;
8012 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8013 flags, prec, c, v);
8014 if (len < 0)
8015 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008016 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008017 }
8018 if (flags & F_ZERO)
8019 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 break;
8021
8022 case 'e':
8023 case 'E':
8024 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008025 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 case 'g':
8027 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008028 if (c == 'F')
8029 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008030 pbuf = formatbuf;
8031 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8032 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 if (len < 0)
8034 goto onError;
8035 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008036 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 fill = '0';
8038 break;
8039
8040 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008041 pbuf = formatbuf;
8042 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 if (len < 0)
8044 goto onError;
8045 break;
8046
8047 default:
8048 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008049 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008050 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008051 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008052 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008053 (Py_ssize_t)(fmt - 1 -
8054 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 goto onError;
8056 }
8057 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008058 if (*pbuf == '-' || *pbuf == '+') {
8059 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 len--;
8061 }
8062 else if (flags & F_SIGN)
8063 sign = '+';
8064 else if (flags & F_BLANK)
8065 sign = ' ';
8066 else
8067 sign = 0;
8068 }
8069 if (width < len)
8070 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008071 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 reslen -= rescnt;
8073 rescnt = width + fmtcnt + 100;
8074 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008075 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008076 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008077 PyErr_NoMemory();
8078 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008079 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008080 if (_PyUnicode_Resize(&result, reslen) < 0) {
8081 Py_XDECREF(temp);
8082 goto onError;
8083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 res = PyUnicode_AS_UNICODE(result)
8085 + reslen - rescnt;
8086 }
8087 if (sign) {
8088 if (fill != ' ')
8089 *res++ = sign;
8090 rescnt--;
8091 if (width > len)
8092 width--;
8093 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008094 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8095 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008096 assert(pbuf[1] == c);
8097 if (fill != ' ') {
8098 *res++ = *pbuf++;
8099 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008100 }
Tim Petersfff53252001-04-12 18:38:48 +00008101 rescnt -= 2;
8102 width -= 2;
8103 if (width < 0)
8104 width = 0;
8105 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 if (width > len && !(flags & F_LJUST)) {
8108 do {
8109 --rescnt;
8110 *res++ = fill;
8111 } while (--width > len);
8112 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008113 if (fill == ' ') {
8114 if (sign)
8115 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008116 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008117 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008118 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008119 *res++ = *pbuf++;
8120 *res++ = *pbuf++;
8121 }
8122 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008123 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 res += len;
8125 rescnt -= len;
8126 while (--width >= len) {
8127 --rescnt;
8128 *res++ = ' ';
8129 }
8130 if (dict && (argidx < arglen) && c != '%') {
8131 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008132 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008133 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 goto onError;
8135 }
8136 Py_XDECREF(temp);
8137 } /* '%' */
8138 } /* until end */
8139 if (argidx < arglen && !dict) {
8140 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008141 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 goto onError;
8143 }
8144
Thomas Woutersa96affe2006-03-12 00:29:36 +00008145 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8146 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 if (args_owned) {
8148 Py_DECREF(args);
8149 }
8150 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 return (PyObject *)result;
8152
8153 onError:
8154 Py_XDECREF(result);
8155 Py_DECREF(uformat);
8156 if (args_owned) {
8157 Py_DECREF(args);
8158 }
8159 return NULL;
8160}
8161
8162static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008163 (readbufferproc) unicode_buffer_getreadbuf,
8164 (writebufferproc) unicode_buffer_getwritebuf,
8165 (segcountproc) unicode_buffer_getsegcount,
8166 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167};
8168
Jeremy Hylton938ace62002-07-17 16:30:39 +00008169static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008170unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8171
Tim Peters6d6c1a32001-08-02 04:15:00 +00008172static PyObject *
8173unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8174{
8175 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008176 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008177 char *encoding = NULL;
8178 char *errors = NULL;
8179
Guido van Rossume023fe02001-08-30 03:12:59 +00008180 if (type != &PyUnicode_Type)
8181 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008182 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8183 kwlist, &x, &encoding, &errors))
8184 return NULL;
8185 if (x == NULL)
8186 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008187 if (encoding == NULL && errors == NULL)
8188 return PyObject_Unicode(x);
8189 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008190 return PyUnicode_FromEncodedObject(x, encoding, errors);
8191}
8192
Guido van Rossume023fe02001-08-30 03:12:59 +00008193static PyObject *
8194unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8195{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008196 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008197 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008198
8199 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8200 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8201 if (tmp == NULL)
8202 return NULL;
8203 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008204 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008205 if (pnew == NULL) {
8206 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008207 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008208 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008209 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8210 if (pnew->str == NULL) {
8211 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008212 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008213 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008214 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008215 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008216 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8217 pnew->length = n;
8218 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008219 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008220 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008221}
8222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008223PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008224"unicode(string [, encoding[, errors]]) -> object\n\
8225\n\
8226Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008227encoding defaults to the current default string encoding.\n\
8228errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008229
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008231 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232 "unicode", /* tp_name */
8233 sizeof(PyUnicodeObject), /* tp_size */
8234 0, /* tp_itemsize */
8235 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008236 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008238 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008240 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008241 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008242 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008244 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 (hashfunc) unicode_hash, /* tp_hash*/
8246 0, /* tp_call*/
8247 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008248 PyObject_GenericGetAttr, /* tp_getattro */
8249 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008251 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008252 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008253 unicode_doc, /* tp_doc */
8254 0, /* tp_traverse */
8255 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008256 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008257 0, /* tp_weaklistoffset */
8258 0, /* tp_iter */
8259 0, /* tp_iternext */
8260 unicode_methods, /* tp_methods */
8261 0, /* tp_members */
8262 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008263 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008264 0, /* tp_dict */
8265 0, /* tp_descr_get */
8266 0, /* tp_descr_set */
8267 0, /* tp_dictoffset */
8268 0, /* tp_init */
8269 0, /* tp_alloc */
8270 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008271 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272};
8273
8274/* Initialize the Unicode implementation */
8275
Thomas Wouters78890102000-07-22 19:25:51 +00008276void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008278 int i;
8279
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008280 /* XXX - move this array to unicodectype.c ? */
8281 Py_UNICODE linebreak[] = {
8282 0x000A, /* LINE FEED */
8283 0x000D, /* CARRIAGE RETURN */
8284 0x001C, /* FILE SEPARATOR */
8285 0x001D, /* GROUP SEPARATOR */
8286 0x001E, /* RECORD SEPARATOR */
8287 0x0085, /* NEXT LINE */
8288 0x2028, /* LINE SEPARATOR */
8289 0x2029, /* PARAGRAPH SEPARATOR */
8290 };
8291
Fred Drakee4315f52000-05-09 19:53:39 +00008292 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008293 unicode_freelist = NULL;
8294 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008296 if (!unicode_empty)
8297 return;
8298
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008299 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008300 for (i = 0; i < 256; i++)
8301 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008302 if (PyType_Ready(&PyUnicode_Type) < 0)
8303 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008304
8305 /* initialize the linebreak bloom filter */
8306 bloom_linebreak = make_bloom_mask(
8307 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8308 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008309
8310 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311}
8312
8313/* Finalize the Unicode implementation */
8314
8315void
Thomas Wouters78890102000-07-22 19:25:51 +00008316_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008318 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008319 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008321 Py_XDECREF(unicode_empty);
8322 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008323
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008324 for (i = 0; i < 256; i++) {
8325 if (unicode_latin1[i]) {
8326 Py_DECREF(unicode_latin1[i]);
8327 unicode_latin1[i] = NULL;
8328 }
8329 }
8330
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008331 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 PyUnicodeObject *v = u;
8333 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008334 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008335 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008336 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008337 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008339 unicode_freelist = NULL;
8340 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008342
Anthony Baxterac6bd462006-04-13 02:06:09 +00008343#ifdef __cplusplus
8344}
8345#endif
8346
8347
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008348/*
8349Local variables:
8350c-basic-offset: 4
8351indent-tabs-mode: nil
8352End:
8353*/