blob: 26aa7533bab43440c2cbf028e00f5e9d09f21f16 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low 5
   bits of each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long: shifting a plain int 1 into
   bit 31 would overflow a signed int, which is undefined behaviour. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter on the bloom mask, then the real line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Membership test with a bloom mask pre-filter.  The whole expansion is
   parenthesized so the macro can safely be negated or combined with
   other operators at the point of use. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Martin v. Löwis68192102007-07-21 06:55:02 +0000315 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis68192102007-07-21 06:55:02 +0000329 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
354/* Internal API for use in unicodeobject.c only ! */
355#define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +0000557 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000607 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000717 Py_Type(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
799 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000800 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000802 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000803
804 PyObject *restuple = NULL;
805 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000806 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
807 Py_ssize_t requiredsize;
808 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811 int res = -1;
812
813 if (*errorHandler == NULL) {
814 *errorHandler = PyCodec_LookupError(errors);
815 if (*errorHandler == NULL)
816 goto onError;
817 }
818
819 if (*exceptionObject == NULL) {
820 *exceptionObject = PyUnicodeDecodeError_Create(
821 encoding, input, insize, *startinpos, *endinpos, reason);
822 if (*exceptionObject == NULL)
823 goto onError;
824 }
825 else {
826 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
827 goto onError;
828 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
829 goto onError;
830 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
831 goto onError;
832 }
833
834 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
835 if (restuple == NULL)
836 goto onError;
837 if (!PyTuple_Check(restuple)) {
838 PyErr_Format(PyExc_TypeError, &argparse[4]);
839 goto onError;
840 }
841 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
842 goto onError;
843 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000844 newpos = insize+newpos;
845 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000846 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000847 goto onError;
848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849
850 /* need more space? (at least enough for what we
851 have+the replacement+the rest of the string (starting
852 at the new input position), so we won't have to check space
853 when there are no errors in the rest of the string) */
854 repptr = PyUnicode_AS_UNICODE(repunicode);
855 repsize = PyUnicode_GET_SIZE(repunicode);
856 requiredsize = *outpos + repsize + insize-newpos;
857 if (requiredsize > outsize) {
858 if (requiredsize<2*outsize)
859 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000860 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861 goto onError;
862 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
863 }
864 *endinpos = newpos;
865 *inptr = input + newpos;
866 Py_UNICODE_COPY(*outptr, repptr, repsize);
867 *outptr += repsize;
868 *outpos += repsize;
869 /* we made it! */
870 res = 0;
871
872 onError:
873 Py_XDECREF(restuple);
874 return res;
875}
876
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000877/* --- UTF-7 Codec -------------------------------------------------------- */
878
879/* see RFC2152 for details */
880
/* Classification of the 128 ASCII code points for the UTF-7 codec:
   indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   The table is only read, so it is declared const. */
static const char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
899
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must be base64-encoded: outside ASCII, always-special, or in
   an optional class (whitespace / Set O) that the caller asked to encode. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 alphabet character.  Explicit ASCII ranges are
   used instead of isalnum(): isalnum() is locale-dependent and undefined
   for arguments that are not representable as unsigned char, and callers
   pass Py_UNICODE values here. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64: map a base64 alphabet character to its 6-bit value.
   Only meaningful when B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit one base64 character to *out for every complete group of 6 bits
   held in ch, decrementing bits accordingly. */
#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *(out)++ = B64((ch) >> ((bits)-6)); \
        (bits) -= 6; \
    }

/* Emit one Py_UNICODE to *out for every complete group of 16 bits held in
   ch.  A code unit in 0xDC00..0xDFFF is reported as an error ("code pairs
   are not supported") and sets `surrogate`; the code unit following such
   an error is dropped without further checking.  Requires `errmsg` and a
   `utf7Error` label in the expanding function. */
#define DECODE(out, ch, bits, surrogate) \
    while ((bits) >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            (surrogate) = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *(out)++ = outCh; \
        } \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000943PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000944 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 const char *errors)
946{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000947 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000948 Py_ssize_t startinpos;
949 Py_ssize_t endinpos;
950 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951 const char *e;
952 PyUnicodeObject *unicode;
953 Py_UNICODE *p;
954 const char *errmsg = "";
955 int inShift = 0;
956 unsigned int bitsleft = 0;
957 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000958 int surrogate = 0;
959 PyObject *errorHandler = NULL;
960 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000961
962 unicode = _PyUnicode_New(size);
963 if (!unicode)
964 return NULL;
965 if (size == 0)
966 return (PyObject *)unicode;
967
968 p = unicode->str;
969 e = s + size;
970
971 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 Py_UNICODE ch;
973 restart:
974 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000975
976 if (inShift) {
977 if ((ch == '-') || !B64CHAR(ch)) {
978 inShift = 0;
979 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000980
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
982 if (bitsleft >= 6) {
983 /* The shift sequence has a partial character in it. If
984 bitsleft < 6 then we could just classify it as padding
985 but that is not the case here */
986
987 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000988 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000989 }
990 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000991 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000992 here so indicate the potential of a misencoded character. */
993
994 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
995 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
996 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000997 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000998 }
999
1000 if (ch == '-') {
1001 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001002 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003 inShift = 1;
1004 }
1005 } else if (SPECIAL(ch,0,0)) {
1006 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001007 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001008 } else {
1009 *p++ = ch;
1010 }
1011 } else {
1012 charsleft = (charsleft << 6) | UB64(ch);
1013 bitsleft += 6;
1014 s++;
1015 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1016 }
1017 }
1018 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001019 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 s++;
1021 if (s < e && *s == '-') {
1022 s++;
1023 *p++ = '+';
1024 } else
1025 {
1026 inShift = 1;
1027 bitsleft = 0;
1028 }
1029 }
1030 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001031 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001032 errmsg = "unexpected special character";
1033 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001034 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001035 }
1036 else {
1037 *p++ = ch;
1038 s++;
1039 }
1040 continue;
1041 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001042 outpos = p-PyUnicode_AS_UNICODE(unicode);
1043 endinpos = s-starts;
1044 if (unicode_decode_call_errorhandler(
1045 errors, &errorHandler,
1046 "utf7", errmsg,
1047 starts, size, &startinpos, &endinpos, &exc, &s,
1048 (PyObject **)&unicode, &outpos, &p))
1049 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001050 }
1051
1052 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001053 outpos = p-PyUnicode_AS_UNICODE(unicode);
1054 endinpos = size;
1055 if (unicode_decode_call_errorhandler(
1056 errors, &errorHandler,
1057 "utf7", "unterminated shift sequence",
1058 starts, size, &startinpos, &endinpos, &exc, &s,
1059 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001060 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001061 if (s < e)
1062 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 }
1064
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001065 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 goto onError;
1067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001068 Py_XDECREF(errorHandler);
1069 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001070 return (PyObject *)unicode;
1071
1072onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001073 Py_XDECREF(errorHandler);
1074 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001075 Py_DECREF(unicode);
1076 return NULL;
1077}
1078
1079
1080PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001081 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001082 int encodeSetO,
1083 int encodeWhiteSpace,
1084 const char *errors)
1085{
1086 PyObject *v;
1087 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001090 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001091 unsigned int bitsleft = 0;
1092 unsigned long charsleft = 0;
1093 char * out;
1094 char * start;
1095
1096 if (size == 0)
1097 return PyString_FromStringAndSize(NULL, 0);
1098
1099 v = PyString_FromStringAndSize(NULL, cbAllocated);
1100 if (v == NULL)
1101 return NULL;
1102
1103 start = out = PyString_AS_STRING(v);
1104 for (;i < size; ++i) {
1105 Py_UNICODE ch = s[i];
1106
1107 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001108 if (ch == '+') {
1109 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001110 *out++ = '-';
1111 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1112 charsleft = ch;
1113 bitsleft = 16;
1114 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001116 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001117 } else {
1118 *out++ = (char) ch;
1119 }
1120 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001121 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1122 *out++ = B64(charsleft << (6-bitsleft));
1123 charsleft = 0;
1124 bitsleft = 0;
1125 /* Characters not in the BASE64 set implicitly unshift the sequence
1126 so no '-' is required, except if the character is itself a '-' */
1127 if (B64CHAR(ch) || ch == '-') {
1128 *out++ = '-';
1129 }
1130 inShift = 0;
1131 *out++ = (char) ch;
1132 } else {
1133 bitsleft += 16;
1134 charsleft = (charsleft << 16) | ch;
1135 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1136
1137 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001138 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001139 or '-' then the shift sequence will be terminated implicitly and we
1140 don't have to insert a '-'. */
1141
1142 if (bitsleft == 0) {
1143 if (i + 1 < size) {
1144 Py_UNICODE ch2 = s[i+1];
1145
1146 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001147
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001148 } else if (B64CHAR(ch2) || ch2 == '-') {
1149 *out++ = '-';
1150 inShift = 0;
1151 } else {
1152 inShift = 0;
1153 }
1154
1155 }
1156 else {
1157 *out++ = '-';
1158 inShift = 0;
1159 }
1160 }
Tim Petersced69f82003-09-16 20:30:58 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001163 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001164 if (bitsleft) {
1165 *out++= B64(charsleft << (6-bitsleft) );
1166 *out++ = '-';
1167 }
1168
Tim Peters5de98422002-04-27 18:44:32 +00001169 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001170 return v;
1171}
1172
1173#undef SPECIAL
1174#undef B64
1175#undef B64CHAR
1176#undef UB64
1177#undef ENCODE
1178#undef DECODE
1179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180/* --- UTF-8 Codec -------------------------------------------------------- */
1181
/* Map UTF-8 encoded prefix byte to sequence length. zero means
   illegal prefix. see RFC 2279 for details.
   The table is only read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1203
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001205 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 const char *errors)
1207{
Walter Dörwald69652032004-09-07 20:24:22 +00001208 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1209}
1210
1211PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001213 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001214 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001216 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001218 Py_ssize_t startinpos;
1219 Py_ssize_t endinpos;
1220 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 const char *e;
1222 PyUnicodeObject *unicode;
1223 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 PyObject *errorHandler = NULL;
1226 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227
1228 /* Note: size will always be longer than the resulting Unicode
1229 character count */
1230 unicode = _PyUnicode_New(size);
1231 if (!unicode)
1232 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001233 if (size == 0) {
1234 if (consumed)
1235 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238
1239 /* Unpack UTF-8 encoded data */
1240 p = unicode->str;
1241 e = s + size;
1242
1243 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001244 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245
1246 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001247 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 s++;
1249 continue;
1250 }
1251
1252 n = utf8_code_length[ch];
1253
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001254 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001255 if (consumed)
1256 break;
1257 else {
1258 errmsg = "unexpected end of data";
1259 startinpos = s-starts;
1260 endinpos = size;
1261 goto utf8Error;
1262 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264
1265 switch (n) {
1266
1267 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001268 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 startinpos = s-starts;
1270 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272
1273 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001275 startinpos = s-starts;
1276 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278
1279 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 if ((s[1] & 0xc0) != 0x80) {
1281 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001282 startinpos = s-starts;
1283 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001284 goto utf8Error;
1285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288 startinpos = s-starts;
1289 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 errmsg = "illegal encoding";
1291 goto utf8Error;
1292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001294 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295 break;
1296
1297 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001298 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001299 (s[2] & 0xc0) != 0x80) {
1300 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 startinpos = s-starts;
1302 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001303 goto utf8Error;
1304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001306 if (ch < 0x0800) {
1307 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001308 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001309
1310 XXX For wide builds (UCS-4) we should probably try
1311 to recombine the surrogates into a single code
1312 unit.
1313 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001315 startinpos = s-starts;
1316 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 goto utf8Error;
1318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001320 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001321 break;
1322
1323 case 4:
1324 if ((s[1] & 0xc0) != 0x80 ||
1325 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001326 (s[3] & 0xc0) != 0x80) {
1327 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001328 startinpos = s-starts;
1329 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001330 goto utf8Error;
1331 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001332 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1333 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1334 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001338 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001339 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001340 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001341 startinpos = s-starts;
1342 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001343 goto utf8Error;
1344 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001345#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001346 *p++ = (Py_UNICODE)ch;
1347#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001349
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001350 /* translate from 10000..10FFFF to 0..FFFF */
1351 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001353 /* high surrogate = top 10 bits added to D800 */
1354 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001355
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001356 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001357 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001358#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 break;
1360
1361 default:
1362 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001363 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001364 startinpos = s-starts;
1365 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001366 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 }
1368 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001370
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001371 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001372 outpos = p-PyUnicode_AS_UNICODE(unicode);
1373 if (unicode_decode_call_errorhandler(
1374 errors, &errorHandler,
1375 "utf8", errmsg,
1376 starts, size, &startinpos, &endinpos, &exc, &s,
1377 (PyObject **)&unicode, &outpos, &p))
1378 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 }
Walter Dörwald69652032004-09-07 20:24:22 +00001380 if (consumed)
1381 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382
1383 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001384 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 goto onError;
1386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387 Py_XDECREF(errorHandler);
1388 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 return (PyObject *)unicode;
1390
1391onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001392 Py_XDECREF(errorHandler);
1393 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 Py_DECREF(unicode);
1395 return NULL;
1396}
1397
Tim Peters602f7402002-04-27 18:03:26 +00001398/* Allocation strategy: if the string is short, convert into a stack buffer
1399 and allocate exactly as much space needed at the end. Else allocate the
1400 maximum possible needed (4 result bytes per Unicode character), and return
1401 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001402*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001403PyObject *
1404PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001405 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001406 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407{
Tim Peters602f7402002-04-27 18:03:26 +00001408#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001409
Martin v. Löwis18e16552006-02-15 17:27:45 +00001410 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001411 PyObject *v; /* result string object */
1412 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001414 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001415 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001416
Tim Peters602f7402002-04-27 18:03:26 +00001417 assert(s != NULL);
1418 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419
Tim Peters602f7402002-04-27 18:03:26 +00001420 if (size <= MAX_SHORT_UNICHARS) {
1421 /* Write into the stack buffer; nallocated can't overflow.
1422 * At the end, we'll allocate exactly as much heap space as it
1423 * turns out we need.
1424 */
1425 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1426 v = NULL; /* will allocate after we're done */
1427 p = stackbuf;
1428 }
1429 else {
1430 /* Overallocate on the heap, and give the excess back at the end. */
1431 nallocated = size * 4;
1432 if (nallocated / 4 != size) /* overflow! */
1433 return PyErr_NoMemory();
1434 v = PyString_FromStringAndSize(NULL, nallocated);
1435 if (v == NULL)
1436 return NULL;
1437 p = PyString_AS_STRING(v);
1438 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001439
Tim Peters602f7402002-04-27 18:03:26 +00001440 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001441 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001442
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001443 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001444 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001446
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001448 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001449 *p++ = (char)(0xc0 | (ch >> 6));
1450 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001451 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001452 else {
Tim Peters602f7402002-04-27 18:03:26 +00001453 /* Encode UCS2 Unicode ordinals */
1454 if (ch < 0x10000) {
1455 /* Special case: check for high surrogate */
1456 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1457 Py_UCS4 ch2 = s[i];
1458 /* Check for low surrogate and combine the two to
1459 form a UCS4 value */
1460 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001461 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001462 i++;
1463 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Tim Peters602f7402002-04-27 18:03:26 +00001465 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001466 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001467 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001468 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1469 *p++ = (char)(0x80 | (ch & 0x3f));
1470 continue;
1471 }
1472encodeUCS4:
1473 /* Encode UCS4 Unicode ordinals */
1474 *p++ = (char)(0xf0 | (ch >> 18));
1475 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1476 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1477 *p++ = (char)(0x80 | (ch & 0x3f));
1478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001480
Tim Peters602f7402002-04-27 18:03:26 +00001481 if (v == NULL) {
1482 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001483 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001484 assert(nneeded <= nallocated);
1485 v = PyString_FromStringAndSize(stackbuf, nneeded);
1486 }
1487 else {
1488 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001489 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001490 assert(nneeded <= nallocated);
1491 _PyString_Resize(&v, nneeded);
1492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001494
Tim Peters602f7402002-04-27 18:03:26 +00001495#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496}
1497
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1499{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 if (!PyUnicode_Check(unicode)) {
1501 PyErr_BadArgument();
1502 return NULL;
1503 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001504 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1505 PyUnicode_GET_SIZE(unicode),
1506 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
Walter Dörwald6e390802007-08-17 16:41:28 +00001509/* --- UTF-32 Codec ------------------------------------------------------- */
1510
1511PyObject *
1512PyUnicode_DecodeUTF32(const char *s,
1513 Py_ssize_t size,
1514 const char *errors,
1515 int *byteorder)
1516{
1517 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1518}
1519
1520PyObject *
1521PyUnicode_DecodeUTF32Stateful(const char *s,
1522 Py_ssize_t size,
1523 const char *errors,
1524 int *byteorder,
1525 Py_ssize_t *consumed)
1526{
1527 const char *starts = s;
1528 Py_ssize_t startinpos;
1529 Py_ssize_t endinpos;
1530 Py_ssize_t outpos;
1531 PyUnicodeObject *unicode;
1532 Py_UNICODE *p;
1533#ifndef Py_UNICODE_WIDE
1534 int i, pairs;
1535#else
1536 const int pairs = 0;
1537#endif
1538 const unsigned char *q, *e;
1539 int bo = 0; /* assume native ordering by default */
1540 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00001541 /* Offsets from q for retrieving bytes in the right order. */
1542#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1543 int iorder[] = {0, 1, 2, 3};
1544#else
1545 int iorder[] = {3, 2, 1, 0};
1546#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00001547 PyObject *errorHandler = NULL;
1548 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00001549 /* On narrow builds we split characters outside the BMP into two
1550 codepoints => count how much extra space we need. */
1551#ifndef Py_UNICODE_WIDE
1552 for (i = pairs = 0; i < size/4; i++)
1553 if (((Py_UCS4 *)s)[i] >= 0x10000)
1554 pairs++;
1555#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00001556
1557 /* This might be one to much, because of a BOM */
1558 unicode = _PyUnicode_New((size+3)/4+pairs);
1559 if (!unicode)
1560 return NULL;
1561 if (size == 0)
1562 return (PyObject *)unicode;
1563
1564 /* Unpack UTF-32 encoded data */
1565 p = unicode->str;
1566 q = (unsigned char *)s;
1567 e = q + size;
1568
1569 if (byteorder)
1570 bo = *byteorder;
1571
1572 /* Check for BOM marks (U+FEFF) in the input and adjust current
1573 byte order setting accordingly. In native mode, the leading BOM
1574 mark is skipped, in all other modes, it is copied to the output
1575 stream as-is (giving a ZWNBSP character). */
1576 if (bo == 0) {
1577 if (size >= 4) {
1578 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1579 (q[iorder[1]] << 8) | q[iorder[0]];
1580#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1581 if (bom == 0x0000FEFF) {
1582 q += 4;
1583 bo = -1;
1584 }
1585 else if (bom == 0xFFFE0000) {
1586 q += 4;
1587 bo = 1;
1588 }
1589#else
1590 if (bom == 0x0000FEFF) {
1591 q += 4;
1592 bo = 1;
1593 }
1594 else if (bom == 0xFFFE0000) {
1595 q += 4;
1596 bo = -1;
1597 }
1598#endif
1599 }
1600 }
1601
1602 if (bo == -1) {
1603 /* force LE */
1604 iorder[0] = 0;
1605 iorder[1] = 1;
1606 iorder[2] = 2;
1607 iorder[3] = 3;
1608 }
1609 else if (bo == 1) {
1610 /* force BE */
1611 iorder[0] = 3;
1612 iorder[1] = 2;
1613 iorder[2] = 1;
1614 iorder[3] = 0;
1615 }
1616
1617 while (q < e) {
1618 Py_UCS4 ch;
1619 /* remaining bytes at the end? (size should be divisible by 4) */
1620 if (e-q<4) {
1621 if (consumed)
1622 break;
1623 errmsg = "truncated data";
1624 startinpos = ((const char *)q)-starts;
1625 endinpos = ((const char *)e)-starts;
1626 goto utf32Error;
1627 /* The remaining input chars are ignored if the callback
1628 chooses to skip the input */
1629 }
1630 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1631 (q[iorder[1]] << 8) | q[iorder[0]];
1632
1633 if (ch >= 0x110000)
1634 {
1635 errmsg = "codepoint not in range(0x110000)";
1636 startinpos = ((const char *)q)-starts;
1637 endinpos = startinpos+4;
1638 goto utf32Error;
1639 }
1640#ifndef Py_UNICODE_WIDE
1641 if (ch >= 0x10000)
1642 {
1643 *p++ = 0xD800 | ((ch-0x10000) >> 10);
1644 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
1645 }
1646 else
1647#endif
1648 *p++ = ch;
1649 q += 4;
1650 continue;
1651 utf32Error:
1652 outpos = p-PyUnicode_AS_UNICODE(unicode);
1653 if (unicode_decode_call_errorhandler(
1654 errors, &errorHandler,
1655 "utf32", errmsg,
1656 starts, size, &startinpos, &endinpos, &exc, &s,
1657 (PyObject **)&unicode, &outpos, &p))
1658 goto onError;
1659 }
1660
1661 if (byteorder)
1662 *byteorder = bo;
1663
1664 if (consumed)
1665 *consumed = (const char *)q-starts;
1666
1667 /* Adjust length */
1668 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1669 goto onError;
1670
1671 Py_XDECREF(errorHandler);
1672 Py_XDECREF(exc);
1673 return (PyObject *)unicode;
1674
1675onError:
1676 Py_DECREF(unicode);
1677 Py_XDECREF(errorHandler);
1678 Py_XDECREF(exc);
1679 return NULL;
1680}
1681
1682PyObject *
1683PyUnicode_EncodeUTF32(const Py_UNICODE *s,
1684 Py_ssize_t size,
1685 const char *errors,
1686 int byteorder)
1687{
1688 PyObject *v;
1689 unsigned char *p;
1690#ifndef Py_UNICODE_WIDE
1691 int i, pairs;
1692#else
1693 const int pairs = 0;
1694#endif
1695 /* Offsets from p for storing byte pairs in the right order. */
1696#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1697 int iorder[] = {0, 1, 2, 3};
1698#else
1699 int iorder[] = {3, 2, 1, 0};
1700#endif
1701
1702#define STORECHAR(CH) \
1703 do { \
1704 p[iorder[3]] = ((CH) >> 24) & 0xff; \
1705 p[iorder[2]] = ((CH) >> 16) & 0xff; \
1706 p[iorder[1]] = ((CH) >> 8) & 0xff; \
1707 p[iorder[0]] = (CH) & 0xff; \
1708 p += 4; \
1709 } while(0)
1710
1711 /* In narrow builds we can output surrogate pairs as one codepoint,
1712 so we need less space. */
1713#ifndef Py_UNICODE_WIDE
1714 for (i = pairs = 0; i < size-1; i++)
1715 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
1716 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
1717 pairs++;
1718#endif
1719 v = PyString_FromStringAndSize(NULL,
1720 4 * (size - pairs + (byteorder == 0)));
1721 if (v == NULL)
1722 return NULL;
1723
1724 p = (unsigned char *)PyString_AS_STRING(v);
1725 if (byteorder == 0)
1726 STORECHAR(0xFEFF);
1727 if (size == 0)
1728 return v;
1729
1730 if (byteorder == -1) {
1731 /* force LE */
1732 iorder[0] = 0;
1733 iorder[1] = 1;
1734 iorder[2] = 2;
1735 iorder[3] = 3;
1736 }
1737 else if (byteorder == 1) {
1738 /* force BE */
1739 iorder[0] = 3;
1740 iorder[1] = 2;
1741 iorder[2] = 1;
1742 iorder[3] = 0;
1743 }
1744
1745 while (size-- > 0) {
1746 Py_UCS4 ch = *s++;
1747#ifndef Py_UNICODE_WIDE
1748 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
1749 Py_UCS4 ch2 = *s;
1750 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1751 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1752 s++;
1753 size--;
1754 }
1755 }
1756#endif
1757 STORECHAR(ch);
1758 }
1759 return v;
1760#undef STORECHAR
1761}
1762
1763PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
1764{
1765 if (!PyUnicode_Check(unicode)) {
1766 PyErr_BadArgument();
1767 return NULL;
1768 }
1769 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
1770 PyUnicode_GET_SIZE(unicode),
1771 NULL,
1772 0);
1773}
1774
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775/* --- UTF-16 Codec ------------------------------------------------------- */
1776
Tim Peters772747b2001-08-09 22:21:55 +00001777PyObject *
1778PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001779 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001780 const char *errors,
1781 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782{
Walter Dörwald69652032004-09-07 20:24:22 +00001783 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1784}
1785
1786PyObject *
1787PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001789 const char *errors,
1790 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001791 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001793 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001794 Py_ssize_t startinpos;
1795 Py_ssize_t endinpos;
1796 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 PyUnicodeObject *unicode;
1798 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001799 const unsigned char *q, *e;
1800 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001801 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001802 /* Offsets from q for retrieving byte pairs in the right order. */
1803#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1804 int ihi = 1, ilo = 0;
1805#else
1806 int ihi = 0, ilo = 1;
1807#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 PyObject *errorHandler = NULL;
1809 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810
1811 /* Note: size will always be longer than the resulting Unicode
1812 character count */
1813 unicode = _PyUnicode_New(size);
1814 if (!unicode)
1815 return NULL;
1816 if (size == 0)
1817 return (PyObject *)unicode;
1818
1819 /* Unpack UTF-16 encoded data */
1820 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001821 q = (unsigned char *)s;
1822 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823
1824 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001825 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001827 /* Check for BOM marks (U+FEFF) in the input and adjust current
1828 byte order setting accordingly. In native mode, the leading BOM
1829 mark is skipped, in all other modes, it is copied to the output
1830 stream as-is (giving a ZWNBSP character). */
1831 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001832 if (size >= 2) {
1833 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001834#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001835 if (bom == 0xFEFF) {
1836 q += 2;
1837 bo = -1;
1838 }
1839 else if (bom == 0xFFFE) {
1840 q += 2;
1841 bo = 1;
1842 }
Tim Petersced69f82003-09-16 20:30:58 +00001843#else
Walter Dörwald69652032004-09-07 20:24:22 +00001844 if (bom == 0xFEFF) {
1845 q += 2;
1846 bo = 1;
1847 }
1848 else if (bom == 0xFFFE) {
1849 q += 2;
1850 bo = -1;
1851 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001852#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001853 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001854 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Tim Peters772747b2001-08-09 22:21:55 +00001856 if (bo == -1) {
1857 /* force LE */
1858 ihi = 1;
1859 ilo = 0;
1860 }
1861 else if (bo == 1) {
1862 /* force BE */
1863 ihi = 0;
1864 ilo = 1;
1865 }
1866
1867 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001869 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001871 if (consumed)
1872 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 errmsg = "truncated data";
1874 startinpos = ((const char *)q)-starts;
1875 endinpos = ((const char *)e)-starts;
1876 goto utf16Error;
1877 /* The remaining input chars are ignored if the callback
1878 chooses to skip the input */
1879 }
1880 ch = (q[ihi] << 8) | q[ilo];
1881
Tim Peters772747b2001-08-09 22:21:55 +00001882 q += 2;
1883
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884 if (ch < 0xD800 || ch > 0xDFFF) {
1885 *p++ = ch;
1886 continue;
1887 }
1888
1889 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001890 if (q >= e) {
1891 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001892 startinpos = (((const char *)q)-2)-starts;
1893 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001894 goto utf16Error;
1895 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001896 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001897 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1898 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001899 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001900#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001901 *p++ = ch;
1902 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001903#else
1904 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001905#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001906 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 }
1908 else {
1909 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001910 startinpos = (((const char *)q)-4)-starts;
1911 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 goto utf16Error;
1913 }
1914
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001916 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001917 startinpos = (((const char *)q)-2)-starts;
1918 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001919 /* Fall through to report the error */
1920
1921 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 outpos = p-PyUnicode_AS_UNICODE(unicode);
1923 if (unicode_decode_call_errorhandler(
1924 errors, &errorHandler,
1925 "utf16", errmsg,
1926 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1927 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001928 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 }
1930
1931 if (byteorder)
1932 *byteorder = bo;
1933
Walter Dörwald69652032004-09-07 20:24:22 +00001934 if (consumed)
1935 *consumed = (const char *)q-starts;
1936
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001938 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 goto onError;
1940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001941 Py_XDECREF(errorHandler);
1942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 return (PyObject *)unicode;
1944
1945onError:
1946 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001947 Py_XDECREF(errorHandler);
1948 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 return NULL;
1950}
1951
Tim Peters772747b2001-08-09 22:21:55 +00001952PyObject *
1953PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001954 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001955 const char *errors,
1956 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957{
1958 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001959 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001960#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001961 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001962#else
1963 const int pairs = 0;
1964#endif
Tim Peters772747b2001-08-09 22:21:55 +00001965 /* Offsets from p for storing byte pairs in the right order. */
1966#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1967 int ihi = 1, ilo = 0;
1968#else
1969 int ihi = 0, ilo = 1;
1970#endif
1971
1972#define STORECHAR(CH) \
1973 do { \
1974 p[ihi] = ((CH) >> 8) & 0xff; \
1975 p[ilo] = (CH) & 0xff; \
1976 p += 2; \
1977 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001979#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001980 for (i = pairs = 0; i < size; i++)
1981 if (s[i] >= 0x10000)
1982 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001983#endif
Tim Petersced69f82003-09-16 20:30:58 +00001984 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001985 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 if (v == NULL)
1987 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988
Tim Peters772747b2001-08-09 22:21:55 +00001989 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001991 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001992 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001993 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001994
1995 if (byteorder == -1) {
1996 /* force LE */
1997 ihi = 1;
1998 ilo = 0;
1999 }
2000 else if (byteorder == 1) {
2001 /* force BE */
2002 ihi = 0;
2003 ilo = 1;
2004 }
2005
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002006 while (size-- > 0) {
2007 Py_UNICODE ch = *s++;
2008 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002009#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002010 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002011 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2012 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002014#endif
Tim Peters772747b2001-08-09 22:21:55 +00002015 STORECHAR(ch);
2016 if (ch2)
2017 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002019 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002020#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021}
2022
2023PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2024{
2025 if (!PyUnicode_Check(unicode)) {
2026 PyErr_BadArgument();
2027 return NULL;
2028 }
2029 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2030 PyUnicode_GET_SIZE(unicode),
2031 NULL,
2032 0);
2033}
2034
2035/* --- Unicode Escape Codec ----------------------------------------------- */
2036
Fredrik Lundh06d12682001-01-24 07:59:11 +00002037static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002038
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002040 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 const char *errors)
2042{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002044 Py_ssize_t startinpos;
2045 Py_ssize_t endinpos;
2046 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002051 char* message;
2052 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 PyObject *errorHandler = NULL;
2054 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002055
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056 /* Escaped strings will always be longer than the resulting
2057 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058 length after conversion to the true value.
2059 (but if the error callback returns a long replacement string
2060 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 v = _PyUnicode_New(size);
2062 if (v == NULL)
2063 goto onError;
2064 if (size == 0)
2065 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002069
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 while (s < end) {
2071 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002072 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002073 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074
2075 /* Non-escape characters are interpreted as Unicode ordinals */
2076 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002077 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 continue;
2079 }
2080
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082 /* \ - Escapes */
2083 s++;
2084 switch (*s++) {
2085
2086 /* \x escapes */
2087 case '\n': break;
2088 case '\\': *p++ = '\\'; break;
2089 case '\'': *p++ = '\''; break;
2090 case '\"': *p++ = '\"'; break;
2091 case 'b': *p++ = '\b'; break;
2092 case 'f': *p++ = '\014'; break; /* FF */
2093 case 't': *p++ = '\t'; break;
2094 case 'n': *p++ = '\n'; break;
2095 case 'r': *p++ = '\r'; break;
2096 case 'v': *p++ = '\013'; break; /* VT */
2097 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2098
2099 /* \OOO (octal) escapes */
2100 case '0': case '1': case '2': case '3':
2101 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002102 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002104 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002106 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002108 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 break;
2110
Fredrik Lundhccc74732001-02-18 22:13:49 +00002111 /* hex escapes */
2112 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002114 digits = 2;
2115 message = "truncated \\xXX escape";
2116 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117
Fredrik Lundhccc74732001-02-18 22:13:49 +00002118 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002120 digits = 4;
2121 message = "truncated \\uXXXX escape";
2122 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123
Fredrik Lundhccc74732001-02-18 22:13:49 +00002124 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002125 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002126 digits = 8;
2127 message = "truncated \\UXXXXXXXX escape";
2128 hexescape:
2129 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 outpos = p-PyUnicode_AS_UNICODE(v);
2131 if (s+digits>end) {
2132 endinpos = size;
2133 if (unicode_decode_call_errorhandler(
2134 errors, &errorHandler,
2135 "unicodeescape", "end of string in escape sequence",
2136 starts, size, &startinpos, &endinpos, &exc, &s,
2137 (PyObject **)&v, &outpos, &p))
2138 goto onError;
2139 goto nextByte;
2140 }
2141 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002142 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002143 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002144 endinpos = (s+i+1)-starts;
2145 if (unicode_decode_call_errorhandler(
2146 errors, &errorHandler,
2147 "unicodeescape", message,
2148 starts, size, &startinpos, &endinpos, &exc, &s,
2149 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002150 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002151 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002152 }
2153 chr = (chr<<4) & ~0xF;
2154 if (c >= '0' && c <= '9')
2155 chr += c - '0';
2156 else if (c >= 'a' && c <= 'f')
2157 chr += 10 + c - 'a';
2158 else
2159 chr += 10 + c - 'A';
2160 }
2161 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002162 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002163 /* _decoding_error will have already written into the
2164 target buffer. */
2165 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002166 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002167 /* when we get here, chr is a 32-bit unicode character */
2168 if (chr <= 0xffff)
2169 /* UCS-2 character */
2170 *p++ = (Py_UNICODE) chr;
2171 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002172 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002173 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002174#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002175 *p++ = chr;
2176#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002177 chr -= 0x10000L;
2178 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002179 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002180#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002181 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002182 endinpos = s-starts;
2183 outpos = p-PyUnicode_AS_UNICODE(v);
2184 if (unicode_decode_call_errorhandler(
2185 errors, &errorHandler,
2186 "unicodeescape", "illegal Unicode character",
2187 starts, size, &startinpos, &endinpos, &exc, &s,
2188 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002189 goto onError;
2190 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002191 break;
2192
2193 /* \N{name} */
2194 case 'N':
2195 message = "malformed \\N character escape";
2196 if (ucnhash_CAPI == NULL) {
2197 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002198 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002199 m = PyImport_ImportModule("unicodedata");
2200 if (m == NULL)
2201 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002202 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002203 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002204 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002205 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002206 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002207 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002208 if (ucnhash_CAPI == NULL)
2209 goto ucnhashError;
2210 }
2211 if (*s == '{') {
2212 const char *start = s+1;
2213 /* look for the closing brace */
2214 while (*s != '}' && s < end)
2215 s++;
2216 if (s > start && s < end && *s == '}') {
2217 /* found a name. look it up in the unicode database */
2218 message = "unknown Unicode character name";
2219 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002220 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002221 goto store;
2222 }
2223 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002224 endinpos = s-starts;
2225 outpos = p-PyUnicode_AS_UNICODE(v);
2226 if (unicode_decode_call_errorhandler(
2227 errors, &errorHandler,
2228 "unicodeescape", message,
2229 starts, size, &startinpos, &endinpos, &exc, &s,
2230 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002231 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002232 break;
2233
2234 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002235 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002236 message = "\\ at end of string";
2237 s--;
2238 endinpos = s-starts;
2239 outpos = p-PyUnicode_AS_UNICODE(v);
2240 if (unicode_decode_call_errorhandler(
2241 errors, &errorHandler,
2242 "unicodeescape", message,
2243 starts, size, &startinpos, &endinpos, &exc, &s,
2244 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002245 goto onError;
2246 }
2247 else {
2248 *p++ = '\\';
2249 *p++ = (unsigned char)s[-1];
2250 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002251 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002253 nextByte:
2254 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002256 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002258 Py_XDECREF(errorHandler);
2259 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002261
Fredrik Lundhccc74732001-02-18 22:13:49 +00002262ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002263 PyErr_SetString(
2264 PyExc_UnicodeError,
2265 "\\N escapes not supported (can't load unicodedata module)"
2266 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002267 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 Py_XDECREF(errorHandler);
2269 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002270 return NULL;
2271
Fredrik Lundhccc74732001-02-18 22:13:49 +00002272onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002274 Py_XDECREF(errorHandler);
2275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 return NULL;
2277}
2278
2279/* Return a Unicode-Escape string version of the Unicode object.
2280
2281 If quotes is true, the string is enclosed in u"" or u'' quotes as
2282 appropriate.
2283
2284*/
2285
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002286Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002287 Py_ssize_t size,
2288 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002289{
2290 /* like wcschr, but doesn't stop at NULL characters */
2291
2292 while (size-- > 0) {
2293 if (*s == ch)
2294 return s;
2295 s++;
2296 }
2297
2298 return NULL;
2299}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002300
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301static
2302PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002303 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 int quotes)
2305{
2306 PyObject *repr;
2307 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002309 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310
Neal Norwitz17753ec2006-08-21 22:21:19 +00002311 /* XXX(nnorwitz): rather than over-allocating, it would be
2312 better to choose a different scheme. Perhaps scan the
2313 first N-chars of the string and allocate based on that size.
2314 */
2315 /* Initial allocation is based on the longest-possible unichr
2316 escape.
2317
2318 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2319 unichr, so in this case it's the longest unichr escape. In
2320 narrow (UTF-16) builds this is five chars per source unichr
2321 since there are two unichrs in the surrogate pair, so in narrow
2322 (UTF-16) builds it's not the longest unichr escape.
2323
2324 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2325 so in the narrow (UTF-16) build case it's the longest unichr
2326 escape.
2327 */
2328
2329 repr = PyString_FromStringAndSize(NULL,
2330 2
2331#ifdef Py_UNICODE_WIDE
2332 + 10*size
2333#else
2334 + 6*size
2335#endif
2336 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 if (repr == NULL)
2338 return NULL;
2339
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002340 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341
2342 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002344 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345 !findchar(s, size, '"')) ? '"' : '\'';
2346 }
2347 while (size-- > 0) {
2348 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002349
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002350 /* Escape quotes and backslashes */
2351 if ((quotes &&
2352 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 *p++ = '\\';
2354 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002355 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002356 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002357
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002358#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002359 /* Map 21-bit characters to '\U00xxxxxx' */
2360 else if (ch >= 0x10000) {
2361 *p++ = '\\';
2362 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002363 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2364 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2365 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2366 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2367 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2368 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2369 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002370 *p++ = hexdigit[ch & 0x0000000F];
2371 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002372 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002373#else
2374 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002375 else if (ch >= 0xD800 && ch < 0xDC00) {
2376 Py_UNICODE ch2;
2377 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002378
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002379 ch2 = *s++;
2380 size--;
2381 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2382 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2383 *p++ = '\\';
2384 *p++ = 'U';
2385 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2386 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2387 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2388 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2389 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2390 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2391 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2392 *p++ = hexdigit[ucs & 0x0000000F];
2393 continue;
2394 }
2395 /* Fall through: isolated surrogates are copied as-is */
2396 s--;
2397 size++;
2398 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002399#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002400
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002402 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 *p++ = '\\';
2404 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002405 *p++ = hexdigit[(ch >> 12) & 0x000F];
2406 *p++ = hexdigit[(ch >> 8) & 0x000F];
2407 *p++ = hexdigit[(ch >> 4) & 0x000F];
2408 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002410
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002411 /* Map special whitespace to '\t', \n', '\r' */
2412 else if (ch == '\t') {
2413 *p++ = '\\';
2414 *p++ = 't';
2415 }
2416 else if (ch == '\n') {
2417 *p++ = '\\';
2418 *p++ = 'n';
2419 }
2420 else if (ch == '\r') {
2421 *p++ = '\\';
2422 *p++ = 'r';
2423 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002424
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002425 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002426 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002428 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002429 *p++ = hexdigit[(ch >> 4) & 0x000F];
2430 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002431 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002432
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433 /* Copy everything else as-is */
2434 else
2435 *p++ = (char) ch;
2436 }
2437 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002438 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002439
2440 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002441 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 return repr;
2443}
2444
2445PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002446 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447{
2448 return unicodeescape_string(s, size, 0);
2449}
2450
2451PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2452{
2453 if (!PyUnicode_Check(unicode)) {
2454 PyErr_BadArgument();
2455 return NULL;
2456 }
2457 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2458 PyUnicode_GET_SIZE(unicode));
2459}
2460
2461/* --- Raw Unicode Escape Codec ------------------------------------------- */
2462
2463PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002464 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 const char *errors)
2466{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002467 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002468 Py_ssize_t startinpos;
2469 Py_ssize_t endinpos;
2470 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002472 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473 const char *end;
2474 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 PyObject *errorHandler = NULL;
2476 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002477
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 /* Escaped strings will always be longer than the resulting
2479 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 length after conversion to the true value. (But decoding error
2481 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 v = _PyUnicode_New(size);
2483 if (v == NULL)
2484 goto onError;
2485 if (size == 0)
2486 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488 end = s + size;
2489 while (s < end) {
2490 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002491 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002493 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494
2495 /* Non-escape characters are interpreted as Unicode ordinals */
2496 if (*s != '\\') {
2497 *p++ = (unsigned char)*s++;
2498 continue;
2499 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501
2502 /* \u-escapes are only interpreted iff the number of leading
2503 backslashes if odd */
2504 bs = s;
2505 for (;s < end;) {
2506 if (*s != '\\')
2507 break;
2508 *p++ = (unsigned char)*s++;
2509 }
2510 if (((s - bs) & 1) == 0 ||
2511 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002512 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 continue;
2514 }
2515 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002516 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 s++;
2518
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002519 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002520 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002521 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002522 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002524 endinpos = s-starts;
2525 if (unicode_decode_call_errorhandler(
2526 errors, &errorHandler,
2527 "rawunicodeescape", "truncated \\uXXXX",
2528 starts, size, &startinpos, &endinpos, &exc, &s,
2529 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002531 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 }
2533 x = (x<<4) & ~0xF;
2534 if (c >= '0' && c <= '9')
2535 x += c - '0';
2536 else if (c >= 'a' && c <= 'f')
2537 x += 10 + c - 'a';
2538 else
2539 x += 10 + c - 'A';
2540 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002541#ifndef Py_UNICODE_WIDE
2542 if (x > 0x10000) {
2543 if (unicode_decode_call_errorhandler(
2544 errors, &errorHandler,
2545 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2546 starts, size, &startinpos, &endinpos, &exc, &s,
2547 (PyObject **)&v, &outpos, &p))
2548 goto onError;
2549 }
2550#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002551 *p++ = x;
2552 nextByte:
2553 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002555 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002556 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002557 Py_XDECREF(errorHandler);
2558 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002560
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 onError:
2562 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002563 Py_XDECREF(errorHandler);
2564 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565 return NULL;
2566}
2567
2568PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002569 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570{
2571 PyObject *repr;
2572 char *p;
2573 char *q;
2574
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002575 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002577#ifdef Py_UNICODE_WIDE
2578 repr = PyString_FromStringAndSize(NULL, 10 * size);
2579#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002581#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 if (repr == NULL)
2583 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002584 if (size == 0)
2585 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586
2587 p = q = PyString_AS_STRING(repr);
2588 while (size-- > 0) {
2589 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002590#ifdef Py_UNICODE_WIDE
2591 /* Map 32-bit characters to '\Uxxxxxxxx' */
2592 if (ch >= 0x10000) {
2593 *p++ = '\\';
2594 *p++ = 'U';
2595 *p++ = hexdigit[(ch >> 28) & 0xf];
2596 *p++ = hexdigit[(ch >> 24) & 0xf];
2597 *p++ = hexdigit[(ch >> 20) & 0xf];
2598 *p++ = hexdigit[(ch >> 16) & 0xf];
2599 *p++ = hexdigit[(ch >> 12) & 0xf];
2600 *p++ = hexdigit[(ch >> 8) & 0xf];
2601 *p++ = hexdigit[(ch >> 4) & 0xf];
2602 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002603 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002604 else
2605#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 /* Map 16-bit characters to '\uxxxx' */
2607 if (ch >= 256) {
2608 *p++ = '\\';
2609 *p++ = 'u';
2610 *p++ = hexdigit[(ch >> 12) & 0xf];
2611 *p++ = hexdigit[(ch >> 8) & 0xf];
2612 *p++ = hexdigit[(ch >> 4) & 0xf];
2613 *p++ = hexdigit[ch & 15];
2614 }
2615 /* Copy everything else as-is */
2616 else
2617 *p++ = (char) ch;
2618 }
2619 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002620 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 return repr;
2622}
2623
2624PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2625{
2626 if (!PyUnicode_Check(unicode)) {
2627 PyErr_BadArgument();
2628 return NULL;
2629 }
2630 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2631 PyUnicode_GET_SIZE(unicode));
2632}
2633
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002634/* --- Unicode Internal Codec ------------------------------------------- */
2635
2636PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002637 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002638 const char *errors)
2639{
2640 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002641 Py_ssize_t startinpos;
2642 Py_ssize_t endinpos;
2643 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002644 PyUnicodeObject *v;
2645 Py_UNICODE *p;
2646 const char *end;
2647 const char *reason;
2648 PyObject *errorHandler = NULL;
2649 PyObject *exc = NULL;
2650
Neal Norwitzd43069c2006-01-08 01:12:10 +00002651#ifdef Py_UNICODE_WIDE
2652 Py_UNICODE unimax = PyUnicode_GetMax();
2653#endif
2654
Armin Rigo7ccbca92006-10-04 12:17:45 +00002655 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002656 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2657 if (v == NULL)
2658 goto onError;
2659 if (PyUnicode_GetSize((PyObject *)v) == 0)
2660 return (PyObject *)v;
2661 p = PyUnicode_AS_UNICODE(v);
2662 end = s + size;
2663
2664 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002665 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002666 /* We have to sanity check the raw data, otherwise doom looms for
2667 some malformed UCS-4 data. */
2668 if (
2669 #ifdef Py_UNICODE_WIDE
2670 *p > unimax || *p < 0 ||
2671 #endif
2672 end-s < Py_UNICODE_SIZE
2673 )
2674 {
2675 startinpos = s - starts;
2676 if (end-s < Py_UNICODE_SIZE) {
2677 endinpos = end-starts;
2678 reason = "truncated input";
2679 }
2680 else {
2681 endinpos = s - starts + Py_UNICODE_SIZE;
2682 reason = "illegal code point (> 0x10FFFF)";
2683 }
2684 outpos = p - PyUnicode_AS_UNICODE(v);
2685 if (unicode_decode_call_errorhandler(
2686 errors, &errorHandler,
2687 "unicode_internal", reason,
2688 starts, size, &startinpos, &endinpos, &exc, &s,
2689 (PyObject **)&v, &outpos, &p)) {
2690 goto onError;
2691 }
2692 }
2693 else {
2694 p++;
2695 s += Py_UNICODE_SIZE;
2696 }
2697 }
2698
Martin v. Löwis412fb672006-04-13 06:34:32 +00002699 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002700 goto onError;
2701 Py_XDECREF(errorHandler);
2702 Py_XDECREF(exc);
2703 return (PyObject *)v;
2704
2705 onError:
2706 Py_XDECREF(v);
2707 Py_XDECREF(errorHandler);
2708 Py_XDECREF(exc);
2709 return NULL;
2710}
2711
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712/* --- Latin-1 Codec ------------------------------------------------------ */
2713
2714PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002715 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 const char *errors)
2717{
2718 PyUnicodeObject *v;
2719 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002720
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002722 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002723 Py_UNICODE r = *(unsigned char*)s;
2724 return PyUnicode_FromUnicode(&r, 1);
2725 }
2726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 v = _PyUnicode_New(size);
2728 if (v == NULL)
2729 goto onError;
2730 if (size == 0)
2731 return (PyObject *)v;
2732 p = PyUnicode_AS_UNICODE(v);
2733 while (size-- > 0)
2734 *p++ = (unsigned char)*s++;
2735 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002736
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737 onError:
2738 Py_XDECREF(v);
2739 return NULL;
2740}
2741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002742/* create or adjust a UnicodeEncodeError */
2743static void make_encode_exception(PyObject **exceptionObject,
2744 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002745 const Py_UNICODE *unicode, Py_ssize_t size,
2746 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 if (*exceptionObject == NULL) {
2750 *exceptionObject = PyUnicodeEncodeError_Create(
2751 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 }
2753 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002754 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2755 goto onError;
2756 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2757 goto onError;
2758 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2759 goto onError;
2760 return;
2761 onError:
2762 Py_DECREF(*exceptionObject);
2763 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
2765}
2766
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767/* raises a UnicodeEncodeError */
2768static void raise_encode_exception(PyObject **exceptionObject,
2769 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002770 const Py_UNICODE *unicode, Py_ssize_t size,
2771 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 const char *reason)
2773{
2774 make_encode_exception(exceptionObject,
2775 encoding, unicode, size, startpos, endpos, reason);
2776 if (*exceptionObject != NULL)
2777 PyCodec_StrictErrors(*exceptionObject);
2778}
2779
2780/* error handling callback helper:
2781 build arguments, call the callback and check the arguments,
2782 put the result into newpos and return the replacement string, which
2783 has to be freed by the caller */
2784static PyObject *unicode_encode_call_errorhandler(const char *errors,
2785 PyObject **errorHandler,
2786 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002787 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2788 Py_ssize_t startpos, Py_ssize_t endpos,
2789 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002791 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002792
2793 PyObject *restuple;
2794 PyObject *resunicode;
2795
2796 if (*errorHandler == NULL) {
2797 *errorHandler = PyCodec_LookupError(errors);
2798 if (*errorHandler == NULL)
2799 return NULL;
2800 }
2801
2802 make_encode_exception(exceptionObject,
2803 encoding, unicode, size, startpos, endpos, reason);
2804 if (*exceptionObject == NULL)
2805 return NULL;
2806
2807 restuple = PyObject_CallFunctionObjArgs(
2808 *errorHandler, *exceptionObject, NULL);
2809 if (restuple == NULL)
2810 return NULL;
2811 if (!PyTuple_Check(restuple)) {
2812 PyErr_Format(PyExc_TypeError, &argparse[4]);
2813 Py_DECREF(restuple);
2814 return NULL;
2815 }
2816 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2817 &resunicode, newpos)) {
2818 Py_DECREF(restuple);
2819 return NULL;
2820 }
2821 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002822 *newpos = size+*newpos;
2823 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002824 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002825 Py_DECREF(restuple);
2826 return NULL;
2827 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002828 Py_INCREF(resunicode);
2829 Py_DECREF(restuple);
2830 return resunicode;
2831}
2832
2833static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002834 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 const char *errors,
2836 int limit)
2837{
2838 /* output object */
2839 PyObject *res;
2840 /* pointers to the beginning and end+1 of input */
2841 const Py_UNICODE *startp = p;
2842 const Py_UNICODE *endp = p + size;
2843 /* pointer to the beginning of the unencodable characters */
2844 /* const Py_UNICODE *badp = NULL; */
2845 /* pointer into the output */
2846 char *str;
2847 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002848 Py_ssize_t respos = 0;
2849 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002850 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2851 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002852 PyObject *errorHandler = NULL;
2853 PyObject *exc = NULL;
2854 /* the following variable is used for caching string comparisons
2855 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2856 int known_errorHandler = -1;
2857
2858 /* allocate enough for a simple encoding without
2859 replacements, if we need more, we'll resize */
2860 res = PyString_FromStringAndSize(NULL, size);
2861 if (res == NULL)
2862 goto onError;
2863 if (size == 0)
2864 return res;
2865 str = PyString_AS_STRING(res);
2866 ressize = size;
2867
2868 while (p<endp) {
2869 Py_UNICODE c = *p;
2870
2871 /* can we encode this? */
2872 if (c<limit) {
2873 /* no overflow check, because we know that the space is enough */
2874 *str++ = (char)c;
2875 ++p;
2876 }
2877 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002878 Py_ssize_t unicodepos = p-startp;
2879 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002880 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002881 Py_ssize_t repsize;
2882 Py_ssize_t newpos;
2883 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002884 Py_UNICODE *uni2;
2885 /* startpos for collecting unencodable chars */
2886 const Py_UNICODE *collstart = p;
2887 const Py_UNICODE *collend = p;
2888 /* find all unecodable characters */
2889 while ((collend < endp) && ((*collend)>=limit))
2890 ++collend;
2891 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2892 if (known_errorHandler==-1) {
2893 if ((errors==NULL) || (!strcmp(errors, "strict")))
2894 known_errorHandler = 1;
2895 else if (!strcmp(errors, "replace"))
2896 known_errorHandler = 2;
2897 else if (!strcmp(errors, "ignore"))
2898 known_errorHandler = 3;
2899 else if (!strcmp(errors, "xmlcharrefreplace"))
2900 known_errorHandler = 4;
2901 else
2902 known_errorHandler = 0;
2903 }
2904 switch (known_errorHandler) {
2905 case 1: /* strict */
2906 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2907 goto onError;
2908 case 2: /* replace */
2909 while (collstart++<collend)
2910 *str++ = '?'; /* fall through */
2911 case 3: /* ignore */
2912 p = collend;
2913 break;
2914 case 4: /* xmlcharrefreplace */
2915 respos = str-PyString_AS_STRING(res);
2916 /* determine replacement size (temporarily (mis)uses p) */
2917 for (p = collstart, repsize = 0; p < collend; ++p) {
2918 if (*p<10)
2919 repsize += 2+1+1;
2920 else if (*p<100)
2921 repsize += 2+2+1;
2922 else if (*p<1000)
2923 repsize += 2+3+1;
2924 else if (*p<10000)
2925 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002926#ifndef Py_UNICODE_WIDE
2927 else
2928 repsize += 2+5+1;
2929#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002930 else if (*p<100000)
2931 repsize += 2+5+1;
2932 else if (*p<1000000)
2933 repsize += 2+6+1;
2934 else
2935 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002936#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 }
2938 requiredsize = respos+repsize+(endp-collend);
2939 if (requiredsize > ressize) {
2940 if (requiredsize<2*ressize)
2941 requiredsize = 2*ressize;
2942 if (_PyString_Resize(&res, requiredsize))
2943 goto onError;
2944 str = PyString_AS_STRING(res) + respos;
2945 ressize = requiredsize;
2946 }
2947 /* generate replacement (temporarily (mis)uses p) */
2948 for (p = collstart; p < collend; ++p) {
2949 str += sprintf(str, "&#%d;", (int)*p);
2950 }
2951 p = collend;
2952 break;
2953 default:
2954 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2955 encoding, reason, startp, size, &exc,
2956 collstart-startp, collend-startp, &newpos);
2957 if (repunicode == NULL)
2958 goto onError;
2959 /* need more space? (at least enough for what we
2960 have+the replacement+the rest of the string, so
2961 we won't have to check space for encodable characters) */
2962 respos = str-PyString_AS_STRING(res);
2963 repsize = PyUnicode_GET_SIZE(repunicode);
2964 requiredsize = respos+repsize+(endp-collend);
2965 if (requiredsize > ressize) {
2966 if (requiredsize<2*ressize)
2967 requiredsize = 2*ressize;
2968 if (_PyString_Resize(&res, requiredsize)) {
2969 Py_DECREF(repunicode);
2970 goto onError;
2971 }
2972 str = PyString_AS_STRING(res) + respos;
2973 ressize = requiredsize;
2974 }
2975 /* check if there is anything unencodable in the replacement
2976 and copy it to the output */
2977 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2978 c = *uni2;
2979 if (c >= limit) {
2980 raise_encode_exception(&exc, encoding, startp, size,
2981 unicodepos, unicodepos+1, reason);
2982 Py_DECREF(repunicode);
2983 goto onError;
2984 }
2985 *str = (char)c;
2986 }
2987 p = startp + newpos;
2988 Py_DECREF(repunicode);
2989 }
2990 }
2991 }
2992 /* Resize if we allocated to much */
2993 respos = str-PyString_AS_STRING(res);
2994 if (respos<ressize)
2995 /* If this falls res will be NULL */
2996 _PyString_Resize(&res, respos);
2997 Py_XDECREF(errorHandler);
2998 Py_XDECREF(exc);
2999 return res;
3000
3001 onError:
3002 Py_XDECREF(res);
3003 Py_XDECREF(errorHandler);
3004 Py_XDECREF(exc);
3005 return NULL;
3006}
3007
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003009 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003010 const char *errors)
3011{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013}
3014
3015PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3016{
3017 if (!PyUnicode_Check(unicode)) {
3018 PyErr_BadArgument();
3019 return NULL;
3020 }
3021 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3022 PyUnicode_GET_SIZE(unicode),
3023 NULL);
3024}
3025
3026/* --- 7-bit ASCII Codec -------------------------------------------------- */
3027
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003029 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 const char *errors)
3031{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 PyUnicodeObject *v;
3034 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003035 Py_ssize_t startinpos;
3036 Py_ssize_t endinpos;
3037 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 const char *e;
3039 PyObject *errorHandler = NULL;
3040 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003041
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003043 if (size == 1 && *(unsigned char*)s < 128) {
3044 Py_UNICODE r = *(unsigned char*)s;
3045 return PyUnicode_FromUnicode(&r, 1);
3046 }
Tim Petersced69f82003-09-16 20:30:58 +00003047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 v = _PyUnicode_New(size);
3049 if (v == NULL)
3050 goto onError;
3051 if (size == 0)
3052 return (PyObject *)v;
3053 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 e = s + size;
3055 while (s < e) {
3056 register unsigned char c = (unsigned char)*s;
3057 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 ++s;
3060 }
3061 else {
3062 startinpos = s-starts;
3063 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003064 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065 if (unicode_decode_call_errorhandler(
3066 errors, &errorHandler,
3067 "ascii", "ordinal not in range(128)",
3068 starts, size, &startinpos, &endinpos, &exc, &s,
3069 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003073 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003074 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003075 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 Py_XDECREF(errorHandler);
3077 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003079
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 onError:
3081 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082 Py_XDECREF(errorHandler);
3083 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 return NULL;
3085}
3086
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003088 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 const char *errors)
3090{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092}
3093
3094PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3095{
3096 if (!PyUnicode_Check(unicode)) {
3097 PyErr_BadArgument();
3098 return NULL;
3099 }
3100 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3101 PyUnicode_GET_SIZE(unicode),
3102 NULL);
3103}
3104
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003105#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003106
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003107/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003108
Martin v. Löwisd8251432006-06-14 05:21:04 +00003109#if SIZEOF_INT < SIZEOF_SSIZE_T
3110#define NEED_RETRY
3111#endif
3112
3113/* XXX This code is limited to "true" double-byte encodings, as
3114 a) it assumes an incomplete character consists of a single byte, and
3115 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3116 encodings, see IsDBCSLeadByteEx documentation. */
3117
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.  The byte itself must satisfy IsDBCSLeadByte();
   the character that CharPrev() reports in front of it is then examined
   to rule out the case where this byte is really a trail byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;               /* nothing precedes this byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;               /* previous character is single-byte */
    return curr - prev == 2;    /* previous character is a full pair */
}
3128
3129/*
3130 * Decode MBCS string into unicode object. If 'final' is set, converts
3131 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3132 */
3133static int decode_mbcs(PyUnicodeObject **v,
3134 const char *s, /* MBCS string */
3135 int size, /* sizeof MBCS string */
3136 int final)
3137{
3138 Py_UNICODE *p;
3139 Py_ssize_t n = 0;
3140 int usize = 0;
3141
3142 assert(size >= 0);
3143
3144 /* Skip trailing lead-byte unless 'final' is set */
3145 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3146 --size;
3147
3148 /* First get the size of the result */
3149 if (size > 0) {
3150 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3151 if (usize == 0) {
3152 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3153 return -1;
3154 }
3155 }
3156
3157 if (*v == NULL) {
3158 /* Create unicode object */
3159 *v = _PyUnicode_New(usize);
3160 if (*v == NULL)
3161 return -1;
3162 }
3163 else {
3164 /* Extend unicode object */
3165 n = PyUnicode_GET_SIZE(*v);
3166 if (_PyUnicode_Resize(v, n + usize) < 0)
3167 return -1;
3168 }
3169
3170 /* Do the conversion */
3171 if (size > 0) {
3172 p = PyUnicode_AS_UNICODE(*v) + n;
3173 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3174 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3175 return -1;
3176 }
3177 }
3178
3179 return size;
3180}
3181
3182PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3183 Py_ssize_t size,
3184 const char *errors,
3185 Py_ssize_t *consumed)
3186{
3187 PyUnicodeObject *v = NULL;
3188 int done;
3189
3190 if (consumed)
3191 *consumed = 0;
3192
3193#ifdef NEED_RETRY
3194 retry:
3195 if (size > INT_MAX)
3196 done = decode_mbcs(&v, s, INT_MAX, 0);
3197 else
3198#endif
3199 done = decode_mbcs(&v, s, (int)size, !consumed);
3200
3201 if (done < 0) {
3202 Py_XDECREF(v);
3203 return NULL;
3204 }
3205
3206 if (consumed)
3207 *consumed += done;
3208
3209#ifdef NEED_RETRY
3210 if (size > INT_MAX) {
3211 s += done;
3212 size -= done;
3213 goto retry;
3214 }
3215#endif
3216
3217 return (PyObject *)v;
3218}
3219
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003220PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003221 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003222 const char *errors)
3223{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003224 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3225}
3226
3227/*
3228 * Convert unicode into string object (MBCS).
3229 * Returns 0 if succeed, -1 otherwise.
3230 */
3231static int encode_mbcs(PyObject **repr,
3232 const Py_UNICODE *p, /* unicode */
3233 int size) /* size of unicode */
3234{
3235 int mbcssize = 0;
3236 Py_ssize_t n = 0;
3237
3238 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003239
3240 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003241 if (size > 0) {
3242 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3243 if (mbcssize == 0) {
3244 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3245 return -1;
3246 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003247 }
3248
Martin v. Löwisd8251432006-06-14 05:21:04 +00003249 if (*repr == NULL) {
3250 /* Create string object */
3251 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3252 if (*repr == NULL)
3253 return -1;
3254 }
3255 else {
3256 /* Extend string object */
3257 n = PyString_Size(*repr);
3258 if (_PyString_Resize(repr, n + mbcssize) < 0)
3259 return -1;
3260 }
3261
3262 /* Do the conversion */
3263 if (size > 0) {
3264 char *s = PyString_AS_STRING(*repr) + n;
3265 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3266 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3267 return -1;
3268 }
3269 }
3270
3271 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003272}
3273
3274PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003275 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003276 const char *errors)
3277{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003278 PyObject *repr = NULL;
3279 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003280
Martin v. Löwisd8251432006-06-14 05:21:04 +00003281#ifdef NEED_RETRY
3282 retry:
3283 if (size > INT_MAX)
3284 ret = encode_mbcs(&repr, p, INT_MAX);
3285 else
3286#endif
3287 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003288
Martin v. Löwisd8251432006-06-14 05:21:04 +00003289 if (ret < 0) {
3290 Py_XDECREF(repr);
3291 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003292 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003293
3294#ifdef NEED_RETRY
3295 if (size > INT_MAX) {
3296 p += INT_MAX;
3297 size -= INT_MAX;
3298 goto retry;
3299 }
3300#endif
3301
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003302 return repr;
3303}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003304
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003305PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3306{
3307 if (!PyUnicode_Check(unicode)) {
3308 PyErr_BadArgument();
3309 return NULL;
3310 }
3311 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3312 PyUnicode_GET_SIZE(unicode),
3313 NULL);
3314}
3315
Martin v. Löwisd8251432006-06-14 05:21:04 +00003316#undef NEED_RETRY
3317
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003318#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320/* --- Character Mapping Codec -------------------------------------------- */
3321
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003323 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324 PyObject *mapping,
3325 const char *errors)
3326{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003327 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003328 Py_ssize_t startinpos;
3329 Py_ssize_t endinpos;
3330 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003331 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 PyUnicodeObject *v;
3333 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003334 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 PyObject *errorHandler = NULL;
3336 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003337 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003338 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003339
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 /* Default to Latin-1 */
3341 if (mapping == NULL)
3342 return PyUnicode_DecodeLatin1(s, size, errors);
3343
3344 v = _PyUnicode_New(size);
3345 if (v == NULL)
3346 goto onError;
3347 if (size == 0)
3348 return (PyObject *)v;
3349 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003351 if (PyUnicode_CheckExact(mapping)) {
3352 mapstring = PyUnicode_AS_UNICODE(mapping);
3353 maplen = PyUnicode_GET_SIZE(mapping);
3354 while (s < e) {
3355 unsigned char ch = *s;
3356 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003358 if (ch < maplen)
3359 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003361 if (x == 0xfffe) {
3362 /* undefined mapping */
3363 outpos = p-PyUnicode_AS_UNICODE(v);
3364 startinpos = s-starts;
3365 endinpos = startinpos+1;
3366 if (unicode_decode_call_errorhandler(
3367 errors, &errorHandler,
3368 "charmap", "character maps to <undefined>",
3369 starts, size, &startinpos, &endinpos, &exc, &s,
3370 (PyObject **)&v, &outpos, &p)) {
3371 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003372 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003373 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003374 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003375 *p++ = x;
3376 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003378 }
3379 else {
3380 while (s < e) {
3381 unsigned char ch = *s;
3382 PyObject *w, *x;
3383
3384 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3385 w = PyInt_FromLong((long)ch);
3386 if (w == NULL)
3387 goto onError;
3388 x = PyObject_GetItem(mapping, w);
3389 Py_DECREF(w);
3390 if (x == NULL) {
3391 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3392 /* No mapping found means: mapping is undefined. */
3393 PyErr_Clear();
3394 x = Py_None;
3395 Py_INCREF(x);
3396 } else
3397 goto onError;
3398 }
3399
3400 /* Apply mapping */
3401 if (PyInt_Check(x)) {
3402 long value = PyInt_AS_LONG(x);
3403 if (value < 0 || value > 65535) {
3404 PyErr_SetString(PyExc_TypeError,
3405 "character mapping must be in range(65536)");
3406 Py_DECREF(x);
3407 goto onError;
3408 }
3409 *p++ = (Py_UNICODE)value;
3410 }
3411 else if (x == Py_None) {
3412 /* undefined mapping */
3413 outpos = p-PyUnicode_AS_UNICODE(v);
3414 startinpos = s-starts;
3415 endinpos = startinpos+1;
3416 if (unicode_decode_call_errorhandler(
3417 errors, &errorHandler,
3418 "charmap", "character maps to <undefined>",
3419 starts, size, &startinpos, &endinpos, &exc, &s,
3420 (PyObject **)&v, &outpos, &p)) {
3421 Py_DECREF(x);
3422 goto onError;
3423 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003424 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003425 continue;
3426 }
3427 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003428 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003429
3430 if (targetsize == 1)
3431 /* 1-1 mapping */
3432 *p++ = *PyUnicode_AS_UNICODE(x);
3433
3434 else if (targetsize > 1) {
3435 /* 1-n mapping */
3436 if (targetsize > extrachars) {
3437 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003438 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3439 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003440 (targetsize << 2);
3441 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003442 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003443 if (_PyUnicode_Resize(&v,
3444 PyUnicode_GET_SIZE(v) + needed) < 0) {
3445 Py_DECREF(x);
3446 goto onError;
3447 }
3448 p = PyUnicode_AS_UNICODE(v) + oldpos;
3449 }
3450 Py_UNICODE_COPY(p,
3451 PyUnicode_AS_UNICODE(x),
3452 targetsize);
3453 p += targetsize;
3454 extrachars -= targetsize;
3455 }
3456 /* 1-0 mapping: skip the character */
3457 }
3458 else {
3459 /* wrong return value */
3460 PyErr_SetString(PyExc_TypeError,
3461 "character mapping must return integer, None or unicode");
3462 Py_DECREF(x);
3463 goto onError;
3464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003466 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 }
3469 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003470 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 Py_XDECREF(errorHandler);
3473 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003475
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 Py_XDECREF(errorHandler);
3478 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 Py_XDECREF(v);
3480 return NULL;
3481}
3482
Martin v. Löwis3f767792006-06-04 19:36:28 +00003483/* Charmap encoding: the lookup table */
3484
3485struct encoding_map{
3486 PyObject_HEAD
3487 unsigned char level1[32];
3488 int count2, count3;
3489 unsigned char level23[1];
3490};
3491
3492static PyObject*
3493encoding_map_size(PyObject *obj, PyObject* args)
3494{
3495 struct encoding_map *map = (struct encoding_map*)obj;
3496 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3497 128*map->count3);
3498}
3499
3500static PyMethodDef encoding_map_methods[] = {
3501 {"size", encoding_map_size, METH_NOARGS,
3502 PyDoc_STR("Return the size (in bytes) of this object") },
3503 { 0 }
3504};
3505
3506static void
3507encoding_map_dealloc(PyObject* o)
3508{
3509 PyObject_FREE(o);
3510}
3511
3512static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00003513 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003514 "EncodingMap", /*tp_name*/
3515 sizeof(struct encoding_map), /*tp_basicsize*/
3516 0, /*tp_itemsize*/
3517 /* methods */
3518 encoding_map_dealloc, /*tp_dealloc*/
3519 0, /*tp_print*/
3520 0, /*tp_getattr*/
3521 0, /*tp_setattr*/
3522 0, /*tp_compare*/
3523 0, /*tp_repr*/
3524 0, /*tp_as_number*/
3525 0, /*tp_as_sequence*/
3526 0, /*tp_as_mapping*/
3527 0, /*tp_hash*/
3528 0, /*tp_call*/
3529 0, /*tp_str*/
3530 0, /*tp_getattro*/
3531 0, /*tp_setattro*/
3532 0, /*tp_as_buffer*/
3533 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3534 0, /*tp_doc*/
3535 0, /*tp_traverse*/
3536 0, /*tp_clear*/
3537 0, /*tp_richcompare*/
3538 0, /*tp_weaklistoffset*/
3539 0, /*tp_iter*/
3540 0, /*tp_iternext*/
3541 encoding_map_methods, /*tp_methods*/
3542 0, /*tp_members*/
3543 0, /*tp_getset*/
3544 0, /*tp_base*/
3545 0, /*tp_dict*/
3546 0, /*tp_descr_get*/
3547 0, /*tp_descr_set*/
3548 0, /*tp_dictoffset*/
3549 0, /*tp_init*/
3550 0, /*tp_alloc*/
3551 0, /*tp_new*/
3552 0, /*tp_free*/
3553 0, /*tp_is_gc*/
3554};
3555
3556PyObject*
3557PyUnicode_BuildEncodingMap(PyObject* string)
3558{
3559 Py_UNICODE *decode;
3560 PyObject *result;
3561 struct encoding_map *mresult;
3562 int i;
3563 int need_dict = 0;
3564 unsigned char level1[32];
3565 unsigned char level2[512];
3566 unsigned char *mlevel1, *mlevel2, *mlevel3;
3567 int count2 = 0, count3 = 0;
3568
3569 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3570 PyErr_BadArgument();
3571 return NULL;
3572 }
3573 decode = PyUnicode_AS_UNICODE(string);
3574 memset(level1, 0xFF, sizeof level1);
3575 memset(level2, 0xFF, sizeof level2);
3576
3577 /* If there isn't a one-to-one mapping of NULL to \0,
3578 or if there are non-BMP characters, we need to use
3579 a mapping dictionary. */
3580 if (decode[0] != 0)
3581 need_dict = 1;
3582 for (i = 1; i < 256; i++) {
3583 int l1, l2;
3584 if (decode[i] == 0
3585 #ifdef Py_UNICODE_WIDE
3586 || decode[i] > 0xFFFF
3587 #endif
3588 ) {
3589 need_dict = 1;
3590 break;
3591 }
3592 if (decode[i] == 0xFFFE)
3593 /* unmapped character */
3594 continue;
3595 l1 = decode[i] >> 11;
3596 l2 = decode[i] >> 7;
3597 if (level1[l1] == 0xFF)
3598 level1[l1] = count2++;
3599 if (level2[l2] == 0xFF)
3600 level2[l2] = count3++;
3601 }
3602
3603 if (count2 >= 0xFF || count3 >= 0xFF)
3604 need_dict = 1;
3605
3606 if (need_dict) {
3607 PyObject *result = PyDict_New();
3608 PyObject *key, *value;
3609 if (!result)
3610 return NULL;
3611 for (i = 0; i < 256; i++) {
3612 key = value = NULL;
3613 key = PyInt_FromLong(decode[i]);
3614 value = PyInt_FromLong(i);
3615 if (!key || !value)
3616 goto failed1;
3617 if (PyDict_SetItem(result, key, value) == -1)
3618 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003619 Py_DECREF(key);
3620 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003621 }
3622 return result;
3623 failed1:
3624 Py_XDECREF(key);
3625 Py_XDECREF(value);
3626 Py_DECREF(result);
3627 return NULL;
3628 }
3629
3630 /* Create a three-level trie */
3631 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3632 16*count2 + 128*count3 - 1);
3633 if (!result)
3634 return PyErr_NoMemory();
3635 PyObject_Init(result, &EncodingMapType);
3636 mresult = (struct encoding_map*)result;
3637 mresult->count2 = count2;
3638 mresult->count3 = count3;
3639 mlevel1 = mresult->level1;
3640 mlevel2 = mresult->level23;
3641 mlevel3 = mresult->level23 + 16*count2;
3642 memcpy(mlevel1, level1, 32);
3643 memset(mlevel2, 0xFF, 16*count2);
3644 memset(mlevel3, 0, 128*count3);
3645 count3 = 0;
3646 for (i = 1; i < 256; i++) {
3647 int o1, o2, o3, i2, i3;
3648 if (decode[i] == 0xFFFE)
3649 /* unmapped character */
3650 continue;
3651 o1 = decode[i]>>11;
3652 o2 = (decode[i]>>7) & 0xF;
3653 i2 = 16*mlevel1[o1] + o2;
3654 if (mlevel2[i2] == 0xFF)
3655 mlevel2[i2] = count3++;
3656 o3 = decode[i] & 0x7F;
3657 i3 = 128*mlevel2[i2] + o3;
3658 mlevel3[i3] = i;
3659 }
3660 return result;
3661}
3662
3663static int
3664encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3665{
3666 struct encoding_map *map = (struct encoding_map*)mapping;
3667 int l1 = c>>11;
3668 int l2 = (c>>7) & 0xF;
3669 int l3 = c & 0x7F;
3670 int i;
3671
3672#ifdef Py_UNICODE_WIDE
3673 if (c > 0xFFFF) {
3674 return -1;
3675 }
3676#endif
3677 if (c == 0)
3678 return 0;
3679 /* level 1*/
3680 i = map->level1[l1];
3681 if (i == 0xFF) {
3682 return -1;
3683 }
3684 /* level 2*/
3685 i = map->level23[16*i+l2];
3686 if (i == 0xFF) {
3687 return -1;
3688 }
3689 /* level 3 */
3690 i = map->level23[16*map->count2 + 128*i + l3];
3691 if (i == 0) {
3692 return -1;
3693 }
3694 return i;
3695}
3696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003697/* Lookup the character ch in the mapping. If the character
3698 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003699 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003700static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 PyObject *w = PyInt_FromLong((long)c);
3703 PyObject *x;
3704
3705 if (w == NULL)
3706 return NULL;
3707 x = PyObject_GetItem(mapping, w);
3708 Py_DECREF(w);
3709 if (x == NULL) {
3710 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3711 /* No mapping found means: mapping is undefined. */
3712 PyErr_Clear();
3713 x = Py_None;
3714 Py_INCREF(x);
3715 return x;
3716 } else
3717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003719 else if (x == Py_None)
3720 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 else if (PyInt_Check(x)) {
3722 long value = PyInt_AS_LONG(x);
3723 if (value < 0 || value > 255) {
3724 PyErr_SetString(PyExc_TypeError,
3725 "character mapping must be in range(256)");
3726 Py_DECREF(x);
3727 return NULL;
3728 }
3729 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 else if (PyString_Check(x))
3732 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 /* wrong return value */
3735 PyErr_SetString(PyExc_TypeError,
3736 "character mapping must return integer, None or str");
3737 Py_DECREF(x);
3738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739 }
3740}
3741
Martin v. Löwis3f767792006-06-04 19:36:28 +00003742static int
3743charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3744{
3745 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3746 /* exponentially overallocate to minimize reallocations */
3747 if (requiredsize < 2*outsize)
3748 requiredsize = 2*outsize;
3749 if (_PyString_Resize(outobj, requiredsize)) {
3750 return 0;
3751 }
3752 return 1;
3753}
3754
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was written to the output */
    enc_FAILED,     /* character has no mapping; nothing was written */
    enc_EXCEPTION   /* a Python exception has been set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758/* lookup the character, put the result in the output string and adjust
3759 various state variables. Reallocate the output string if not enough
3760 space is available. Return a new reference to the object that
3761 was put in the output buffer, or Py_None, if the mapping was undefined
3762 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003763 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003765charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003766 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003768 PyObject *rep;
3769 char *outstart;
3770 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771
Martin v. Löwis68192102007-07-21 06:55:02 +00003772 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003773 int res = encoding_map_lookup(c, mapping);
3774 Py_ssize_t requiredsize = *outpos+1;
3775 if (res == -1)
3776 return enc_FAILED;
3777 if (outsize<requiredsize)
3778 if (!charmapencode_resize(outobj, outpos, requiredsize))
3779 return enc_EXCEPTION;
3780 outstart = PyString_AS_STRING(*outobj);
3781 outstart[(*outpos)++] = (char)res;
3782 return enc_SUCCESS;
3783 }
3784
3785 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003787 return enc_EXCEPTION;
3788 else if (rep==Py_None) {
3789 Py_DECREF(rep);
3790 return enc_FAILED;
3791 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003793 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003794 if (outsize<requiredsize)
3795 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003797 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003799 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003800 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3801 }
3802 else {
3803 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003804 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3805 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003806 if (outsize<requiredsize)
3807 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003809 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003811 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 memcpy(outstart + *outpos, repchars, repsize);
3813 *outpos += repsize;
3814 }
3815 }
Georg Brandl9f167602006-06-04 21:46:16 +00003816 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003817 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818}
3819
3820/* handle an error in PyUnicode_EncodeCharmap
3821 Return 0 on success, -1 on error */
3822static
3823int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003824 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003826 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003827 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828{
3829 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003830 Py_ssize_t repsize;
3831 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003832 Py_UNICODE *uni2;
3833 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003834 Py_ssize_t collstartpos = *inpos;
3835 Py_ssize_t collendpos = *inpos+1;
3836 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003837 char *encoding = "charmap";
3838 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003839 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841 /* find all unencodable characters */
3842 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003843 PyObject *rep;
Martin v. Löwis68192102007-07-21 06:55:02 +00003844 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003845 int res = encoding_map_lookup(p[collendpos], mapping);
3846 if (res != -1)
3847 break;
3848 ++collendpos;
3849 continue;
3850 }
3851
3852 rep = charmapencode_lookup(p[collendpos], mapping);
3853 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003854 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003855 else if (rep!=Py_None) {
3856 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 break;
3858 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003859 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 ++collendpos;
3861 }
3862 /* cache callback name lookup
3863 * (if not done yet, i.e. it's the first error) */
3864 if (*known_errorHandler==-1) {
3865 if ((errors==NULL) || (!strcmp(errors, "strict")))
3866 *known_errorHandler = 1;
3867 else if (!strcmp(errors, "replace"))
3868 *known_errorHandler = 2;
3869 else if (!strcmp(errors, "ignore"))
3870 *known_errorHandler = 3;
3871 else if (!strcmp(errors, "xmlcharrefreplace"))
3872 *known_errorHandler = 4;
3873 else
3874 *known_errorHandler = 0;
3875 }
3876 switch (*known_errorHandler) {
3877 case 1: /* strict */
3878 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3879 return -1;
3880 case 2: /* replace */
3881 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3882 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003883 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003884 return -1;
3885 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003886 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003887 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3888 return -1;
3889 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 }
3891 /* fall through */
3892 case 3: /* ignore */
3893 *inpos = collendpos;
3894 break;
3895 case 4: /* xmlcharrefreplace */
3896 /* generate replacement (temporarily (mis)uses p) */
3897 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3898 char buffer[2+29+1+1];
3899 char *cp;
3900 sprintf(buffer, "&#%d;", (int)p[collpos]);
3901 for (cp = buffer; *cp; ++cp) {
3902 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003903 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003904 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003905 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003906 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3907 return -1;
3908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003909 }
3910 }
3911 *inpos = collendpos;
3912 break;
3913 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003914 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915 encoding, reason, p, size, exceptionObject,
3916 collstartpos, collendpos, &newpos);
3917 if (repunicode == NULL)
3918 return -1;
3919 /* generate replacement */
3920 repsize = PyUnicode_GET_SIZE(repunicode);
3921 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3922 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003923 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 return -1;
3925 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003926 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003928 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3929 return -1;
3930 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 }
3932 *inpos = newpos;
3933 Py_DECREF(repunicode);
3934 }
3935 return 0;
3936}
3937
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003939 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 PyObject *mapping,
3941 const char *errors)
3942{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 /* output object */
3944 PyObject *res = NULL;
3945 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003946 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003948 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949 PyObject *errorHandler = NULL;
3950 PyObject *exc = NULL;
3951 /* the following variable is used for caching string comparisons
3952 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3953 * 3=ignore, 4=xmlcharrefreplace */
3954 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955
3956 /* Default to Latin-1 */
3957 if (mapping == NULL)
3958 return PyUnicode_EncodeLatin1(p, size, errors);
3959
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 /* allocate enough for a simple encoding without
3961 replacements, if we need more, we'll resize */
3962 res = PyString_FromStringAndSize(NULL, size);
3963 if (res == NULL)
3964 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003965 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 while (inpos<size) {
3969 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003970 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3971 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003973 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974 if (charmap_encoding_error(p, size, &inpos, mapping,
3975 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003976 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003977 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003978 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 else
3982 /* done with this character => adjust input position */
3983 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 /* Resize if we allocated too much */
3987 if (respos<PyString_GET_SIZE(res)) {
3988 if (_PyString_Resize(&res, respos))
3989 goto onError;
3990 }
3991 Py_XDECREF(exc);
3992 Py_XDECREF(errorHandler);
3993 return res;
3994
3995 onError:
3996 Py_XDECREF(res);
3997 Py_XDECREF(exc);
3998 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 return NULL;
4000}
4001
4002PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4003 PyObject *mapping)
4004{
4005 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4006 PyErr_BadArgument();
4007 return NULL;
4008 }
4009 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4010 PyUnicode_GET_SIZE(unicode),
4011 mapping,
4012 NULL);
4013}
4014
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004015/* create or adjust a UnicodeTranslateError */
4016static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004017 const Py_UNICODE *unicode, Py_ssize_t size,
4018 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004019 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004021 if (*exceptionObject == NULL) {
4022 *exceptionObject = PyUnicodeTranslateError_Create(
4023 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 }
4025 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4027 goto onError;
4028 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4029 goto onError;
4030 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4031 goto onError;
4032 return;
4033 onError:
4034 Py_DECREF(*exceptionObject);
4035 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 }
4037}
4038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039/* raises a UnicodeTranslateError */
4040static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004041 const Py_UNICODE *unicode, Py_ssize_t size,
4042 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 const char *reason)
4044{
4045 make_translate_exception(exceptionObject,
4046 unicode, size, startpos, endpos, reason);
4047 if (*exceptionObject != NULL)
4048 PyCodec_StrictErrors(*exceptionObject);
4049}
4050
4051/* error handling callback helper:
4052 build arguments, call the callback and check the arguments,
4053 put the result into newpos and return the replacement string, which
4054 has to be freed by the caller */
4055static PyObject *unicode_translate_call_errorhandler(const char *errors,
4056 PyObject **errorHandler,
4057 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004058 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4059 Py_ssize_t startpos, Py_ssize_t endpos,
4060 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004062 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063
Martin v. Löwis412fb672006-04-13 06:34:32 +00004064 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 PyObject *restuple;
4066 PyObject *resunicode;
4067
4068 if (*errorHandler == NULL) {
4069 *errorHandler = PyCodec_LookupError(errors);
4070 if (*errorHandler == NULL)
4071 return NULL;
4072 }
4073
4074 make_translate_exception(exceptionObject,
4075 unicode, size, startpos, endpos, reason);
4076 if (*exceptionObject == NULL)
4077 return NULL;
4078
4079 restuple = PyObject_CallFunctionObjArgs(
4080 *errorHandler, *exceptionObject, NULL);
4081 if (restuple == NULL)
4082 return NULL;
4083 if (!PyTuple_Check(restuple)) {
4084 PyErr_Format(PyExc_TypeError, &argparse[4]);
4085 Py_DECREF(restuple);
4086 return NULL;
4087 }
4088 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004089 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090 Py_DECREF(restuple);
4091 return NULL;
4092 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004093 if (i_newpos<0)
4094 *newpos = size+i_newpos;
4095 else
4096 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004097 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004098 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004099 Py_DECREF(restuple);
4100 return NULL;
4101 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 Py_INCREF(resunicode);
4103 Py_DECREF(restuple);
4104 return resunicode;
4105}
4106
4107/* Lookup the character ch in the mapping and put the result in result,
4108 which must be decrefed by the caller.
4109 Return 0 on success, -1 on error */
4110static
4111int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4112{
4113 PyObject *w = PyInt_FromLong((long)c);
4114 PyObject *x;
4115
4116 if (w == NULL)
4117 return -1;
4118 x = PyObject_GetItem(mapping, w);
4119 Py_DECREF(w);
4120 if (x == NULL) {
4121 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4122 /* No mapping found means: use 1:1 mapping. */
4123 PyErr_Clear();
4124 *result = NULL;
4125 return 0;
4126 } else
4127 return -1;
4128 }
4129 else if (x == Py_None) {
4130 *result = x;
4131 return 0;
4132 }
4133 else if (PyInt_Check(x)) {
4134 long value = PyInt_AS_LONG(x);
4135 long max = PyUnicode_GetMax();
4136 if (value < 0 || value > max) {
4137 PyErr_Format(PyExc_TypeError,
4138 "character mapping must be in range(0x%lx)", max+1);
4139 Py_DECREF(x);
4140 return -1;
4141 }
4142 *result = x;
4143 return 0;
4144 }
4145 else if (PyUnicode_Check(x)) {
4146 *result = x;
4147 return 0;
4148 }
4149 else {
4150 /* wrong return value */
4151 PyErr_SetString(PyExc_TypeError,
4152 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004153 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 return -1;
4155 }
4156}
4157/* ensure that *outobj is at least requiredsize characters long,
4158if not reallocate and adjust various state variables.
4159Return 0 on success, -1 on error */
4160static
Walter Dörwald4894c302003-10-24 14:25:28 +00004161int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004162 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004164 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004165 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004167 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004169 if (requiredsize < 2 * oldsize)
4170 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004171 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 return -1;
4173 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 }
4175 return 0;
4176}
4177/* lookup the character, put the result in the output string and adjust
4178 various state variables. Return a new reference to the object that
4179 was put in the output buffer in *result, or Py_None, if the mapping was
4180 undefined (in which case no character was written).
4181 The called must decref result.
4182 Return 0 on success, -1 on error. */
4183static
Walter Dörwald4894c302003-10-24 14:25:28 +00004184int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004185 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004186 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187{
Walter Dörwald4894c302003-10-24 14:25:28 +00004188 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 return -1;
4190 if (*res==NULL) {
4191 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004192 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 }
4194 else if (*res==Py_None)
4195 ;
4196 else if (PyInt_Check(*res)) {
4197 /* no overflow check, because we know that the space is enough */
4198 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4199 }
4200 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004201 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 if (repsize==1) {
4203 /* no overflow check, because we know that the space is enough */
4204 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4205 }
4206 else if (repsize!=0) {
4207 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004208 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004209 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004210 repsize - 1;
4211 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 return -1;
4213 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4214 *outp += repsize;
4215 }
4216 }
4217 else
4218 return -1;
4219 return 0;
4220}
4221
4222PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004223 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 PyObject *mapping,
4225 const char *errors)
4226{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227 /* output object */
4228 PyObject *res = NULL;
4229 /* pointers to the beginning and end+1 of input */
4230 const Py_UNICODE *startp = p;
4231 const Py_UNICODE *endp = p + size;
4232 /* pointer into the output */
4233 Py_UNICODE *str;
4234 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004235 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 char *reason = "character maps to <undefined>";
4237 PyObject *errorHandler = NULL;
4238 PyObject *exc = NULL;
4239 /* the following variable is used for caching string comparisons
4240 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4241 * 3=ignore, 4=xmlcharrefreplace */
4242 int known_errorHandler = -1;
4243
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244 if (mapping == NULL) {
4245 PyErr_BadArgument();
4246 return NULL;
4247 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248
4249 /* allocate enough for a simple 1:1 translation without
4250 replacements, if we need more, we'll resize */
4251 res = PyUnicode_FromUnicode(NULL, size);
4252 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255 return res;
4256 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 while (p<endp) {
4259 /* try to encode it */
4260 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004261 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 goto onError;
4264 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004265 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004266 if (x!=Py_None) /* it worked => adjust input pointer */
4267 ++p;
4268 else { /* untranslatable character */
4269 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004270 Py_ssize_t repsize;
4271 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 Py_UNICODE *uni2;
4273 /* startpos for collecting untranslatable chars */
4274 const Py_UNICODE *collstart = p;
4275 const Py_UNICODE *collend = p+1;
4276 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 /* find all untranslatable characters */
4279 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004280 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 goto onError;
4282 Py_XDECREF(x);
4283 if (x!=Py_None)
4284 break;
4285 ++collend;
4286 }
4287 /* cache callback name lookup
4288 * (if not done yet, i.e. it's the first error) */
4289 if (known_errorHandler==-1) {
4290 if ((errors==NULL) || (!strcmp(errors, "strict")))
4291 known_errorHandler = 1;
4292 else if (!strcmp(errors, "replace"))
4293 known_errorHandler = 2;
4294 else if (!strcmp(errors, "ignore"))
4295 known_errorHandler = 3;
4296 else if (!strcmp(errors, "xmlcharrefreplace"))
4297 known_errorHandler = 4;
4298 else
4299 known_errorHandler = 0;
4300 }
4301 switch (known_errorHandler) {
4302 case 1: /* strict */
4303 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4304 goto onError;
4305 case 2: /* replace */
4306 /* No need to check for space, this is a 1:1 replacement */
4307 for (coll = collstart; coll<collend; ++coll)
4308 *str++ = '?';
4309 /* fall through */
4310 case 3: /* ignore */
4311 p = collend;
4312 break;
4313 case 4: /* xmlcharrefreplace */
4314 /* generate replacement (temporarily (mis)uses p) */
4315 for (p = collstart; p < collend; ++p) {
4316 char buffer[2+29+1+1];
4317 char *cp;
4318 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004319 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4321 goto onError;
4322 for (cp = buffer; *cp; ++cp)
4323 *str++ = *cp;
4324 }
4325 p = collend;
4326 break;
4327 default:
4328 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4329 reason, startp, size, &exc,
4330 collstart-startp, collend-startp, &newpos);
4331 if (repunicode == NULL)
4332 goto onError;
4333 /* generate replacement */
4334 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004335 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4337 Py_DECREF(repunicode);
4338 goto onError;
4339 }
4340 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4341 *str++ = *uni2;
4342 p = startp + newpos;
4343 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344 }
4345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 /* Resize if we allocated to much */
4348 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004349 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004350 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004351 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352 }
4353 Py_XDECREF(exc);
4354 Py_XDECREF(errorHandler);
4355 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357 onError:
4358 Py_XDECREF(res);
4359 Py_XDECREF(exc);
4360 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 return NULL;
4362}
4363
4364PyObject *PyUnicode_Translate(PyObject *str,
4365 PyObject *mapping,
4366 const char *errors)
4367{
4368 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004369
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 str = PyUnicode_FromObject(str);
4371 if (str == NULL)
4372 goto onError;
4373 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4374 PyUnicode_GET_SIZE(str),
4375 mapping,
4376 errors);
4377 Py_DECREF(str);
4378 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004379
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 onError:
4381 Py_XDECREF(str);
4382 return NULL;
4383}
Tim Petersced69f82003-09-16 20:30:58 +00004384
Guido van Rossum9e896b32000-04-05 20:11:21 +00004385/* --- Decimal Encoder ---------------------------------------------------- */
4386
4387int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004388 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004389 char *output,
4390 const char *errors)
4391{
4392 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 PyObject *errorHandler = NULL;
4394 PyObject *exc = NULL;
4395 const char *encoding = "decimal";
4396 const char *reason = "invalid decimal Unicode string";
4397 /* the following variable is used for caching string comparisons
4398 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4399 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004400
4401 if (output == NULL) {
4402 PyErr_BadArgument();
4403 return -1;
4404 }
4405
4406 p = s;
4407 end = s + length;
4408 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004410 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004412 Py_ssize_t repsize;
4413 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 Py_UNICODE *uni2;
4415 Py_UNICODE *collstart;
4416 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004417
Guido van Rossum9e896b32000-04-05 20:11:21 +00004418 if (Py_UNICODE_ISSPACE(ch)) {
4419 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004421 continue;
4422 }
4423 decimal = Py_UNICODE_TODECIMAL(ch);
4424 if (decimal >= 0) {
4425 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004427 continue;
4428 }
Guido van Rossumba477042000-04-06 18:18:10 +00004429 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004430 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004432 continue;
4433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 /* All other characters are considered unencodable */
4435 collstart = p;
4436 collend = p+1;
4437 while (collend < end) {
4438 if ((0 < *collend && *collend < 256) ||
4439 !Py_UNICODE_ISSPACE(*collend) ||
4440 Py_UNICODE_TODECIMAL(*collend))
4441 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 /* cache callback name lookup
4444 * (if not done yet, i.e. it's the first error) */
4445 if (known_errorHandler==-1) {
4446 if ((errors==NULL) || (!strcmp(errors, "strict")))
4447 known_errorHandler = 1;
4448 else if (!strcmp(errors, "replace"))
4449 known_errorHandler = 2;
4450 else if (!strcmp(errors, "ignore"))
4451 known_errorHandler = 3;
4452 else if (!strcmp(errors, "xmlcharrefreplace"))
4453 known_errorHandler = 4;
4454 else
4455 known_errorHandler = 0;
4456 }
4457 switch (known_errorHandler) {
4458 case 1: /* strict */
4459 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4460 goto onError;
4461 case 2: /* replace */
4462 for (p = collstart; p < collend; ++p)
4463 *output++ = '?';
4464 /* fall through */
4465 case 3: /* ignore */
4466 p = collend;
4467 break;
4468 case 4: /* xmlcharrefreplace */
4469 /* generate replacement (temporarily (mis)uses p) */
4470 for (p = collstart; p < collend; ++p)
4471 output += sprintf(output, "&#%d;", (int)*p);
4472 p = collend;
4473 break;
4474 default:
4475 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4476 encoding, reason, s, length, &exc,
4477 collstart-s, collend-s, &newpos);
4478 if (repunicode == NULL)
4479 goto onError;
4480 /* generate replacement */
4481 repsize = PyUnicode_GET_SIZE(repunicode);
4482 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4483 Py_UNICODE ch = *uni2;
4484 if (Py_UNICODE_ISSPACE(ch))
4485 *output++ = ' ';
4486 else {
4487 decimal = Py_UNICODE_TODECIMAL(ch);
4488 if (decimal >= 0)
4489 *output++ = '0' + decimal;
4490 else if (0 < ch && ch < 256)
4491 *output++ = (char)ch;
4492 else {
4493 Py_DECREF(repunicode);
4494 raise_encode_exception(&exc, encoding,
4495 s, length, collstart-s, collend-s, reason);
4496 goto onError;
4497 }
4498 }
4499 }
4500 p = s + newpos;
4501 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004502 }
4503 }
4504 /* 0-terminate the output string */
4505 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 Py_XDECREF(exc);
4507 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004508 return 0;
4509
4510 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 Py_XDECREF(exc);
4512 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004513 return -1;
4514}
4515
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516/* --- Helpers ------------------------------------------------------------ */
4517
Fredrik Lundha50d2012006-05-26 17:04:58 +00004518#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004519
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004520#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004521#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004522#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004523
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004524Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004525STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4526{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004527 if (str[0] != other[0])
4528 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004529 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4530}
4531
Fredrik Lundhb9479482006-05-26 17:22:38 +00004532#define STRINGLIB_EMPTY unicode_empty
4533
Fredrik Lundha50d2012006-05-26 17:04:58 +00004534#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004535
4536#include "stringlib/count.h"
4537#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004538#include "stringlib/partition.h"
4539
/* Helper macro to fix up start/end slice values against (obj)->length.
   It operates on the variables named `start` and `end` in the calling
   scope: negative values are taken relative to the length and then
   clamped to 0; `end` is additionally capped at the length.
   (obj) is evaluated more than once. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4552
Martin v. Löwis18e16552006-02-15 17:27:45 +00004553Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004554 PyObject *substr,
4555 Py_ssize_t start,
4556 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004558 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004559 PyUnicodeObject* str_obj;
4560 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004561
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004562 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4563 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004565 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4566 if (!sub_obj) {
4567 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568 return -1;
4569 }
Tim Petersced69f82003-09-16 20:30:58 +00004570
Fredrik Lundhc8162812006-05-26 19:33:03 +00004571 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004572
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004573 result = stringlib_count(
4574 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4575 );
4576
4577 Py_DECREF(sub_obj);
4578 Py_DECREF(str_obj);
4579
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580 return result;
4581}
4582
Martin v. Löwis18e16552006-02-15 17:27:45 +00004583Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004584 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004585 Py_ssize_t start,
4586 Py_ssize_t end,
4587 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004589 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004590
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004591 str = PyUnicode_FromObject(str);
4592 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004593 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004594 sub = PyUnicode_FromObject(sub);
4595 if (!sub) {
4596 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004597 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 }
Tim Petersced69f82003-09-16 20:30:58 +00004599
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004600 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004601 result = stringlib_find_slice(
4602 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4603 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4604 start, end
4605 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004606 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004607 result = stringlib_rfind_slice(
4608 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4609 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4610 start, end
4611 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004612
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004613 Py_DECREF(str);
4614 Py_DECREF(sub);
4615
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616 return result;
4617}
4618
Tim Petersced69f82003-09-16 20:30:58 +00004619static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620int tailmatch(PyUnicodeObject *self,
4621 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004622 Py_ssize_t start,
4623 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624 int direction)
4625{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 if (substring->length == 0)
4627 return 1;
4628
Fredrik Lundhc8162812006-05-26 19:33:03 +00004629 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630
4631 end -= substring->length;
4632 if (end < start)
4633 return 0;
4634
4635 if (direction > 0) {
4636 if (Py_UNICODE_MATCH(self, end, substring))
4637 return 1;
4638 } else {
4639 if (Py_UNICODE_MATCH(self, start, substring))
4640 return 1;
4641 }
4642
4643 return 0;
4644}
4645
Martin v. Löwis18e16552006-02-15 17:27:45 +00004646Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004648 Py_ssize_t start,
4649 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 int direction)
4651{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004652 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004653
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654 str = PyUnicode_FromObject(str);
4655 if (str == NULL)
4656 return -1;
4657 substr = PyUnicode_FromObject(substr);
4658 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004659 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660 return -1;
4661 }
Tim Petersced69f82003-09-16 20:30:58 +00004662
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 result = tailmatch((PyUnicodeObject *)str,
4664 (PyUnicodeObject *)substr,
4665 start, end, direction);
4666 Py_DECREF(str);
4667 Py_DECREF(substr);
4668 return result;
4669}
4670
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671/* Apply fixfct filter to the Unicode object self and return a
4672 reference to the modified object */
4673
Tim Petersced69f82003-09-16 20:30:58 +00004674static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675PyObject *fixup(PyUnicodeObject *self,
4676 int (*fixfct)(PyUnicodeObject *s))
4677{
4678
4679 PyUnicodeObject *u;
4680
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004681 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 if (u == NULL)
4683 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004684
4685 Py_UNICODE_COPY(u->str, self->str, self->length);
4686
Tim Peters7a29bd52001-09-12 03:03:31 +00004687 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 /* fixfct should return TRUE if it modified the buffer. If
4689 FALSE, return a reference to the original buffer instead
4690 (to save space, not time) */
4691 Py_INCREF(self);
4692 Py_DECREF(u);
4693 return (PyObject*) self;
4694 }
4695 return (PyObject*) u;
4696}
4697
Tim Petersced69f82003-09-16 20:30:58 +00004698static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699int fixupper(PyUnicodeObject *self)
4700{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004701 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 Py_UNICODE *s = self->str;
4703 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004704
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 while (len-- > 0) {
4706 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004707
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 ch = Py_UNICODE_TOUPPER(*s);
4709 if (ch != *s) {
4710 status = 1;
4711 *s = ch;
4712 }
4713 s++;
4714 }
4715
4716 return status;
4717}
4718
Tim Petersced69f82003-09-16 20:30:58 +00004719static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720int fixlower(PyUnicodeObject *self)
4721{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004722 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723 Py_UNICODE *s = self->str;
4724 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004725
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 while (len-- > 0) {
4727 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004728
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 ch = Py_UNICODE_TOLOWER(*s);
4730 if (ch != *s) {
4731 status = 1;
4732 *s = ch;
4733 }
4734 s++;
4735 }
4736
4737 return status;
4738}
4739
Tim Petersced69f82003-09-16 20:30:58 +00004740static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741int fixswapcase(PyUnicodeObject *self)
4742{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004743 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 Py_UNICODE *s = self->str;
4745 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004746
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 while (len-- > 0) {
4748 if (Py_UNICODE_ISUPPER(*s)) {
4749 *s = Py_UNICODE_TOLOWER(*s);
4750 status = 1;
4751 } else if (Py_UNICODE_ISLOWER(*s)) {
4752 *s = Py_UNICODE_TOUPPER(*s);
4753 status = 1;
4754 }
4755 s++;
4756 }
4757
4758 return status;
4759}
4760
Tim Petersced69f82003-09-16 20:30:58 +00004761static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762int fixcapitalize(PyUnicodeObject *self)
4763{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004764 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004765 Py_UNICODE *s = self->str;
4766 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004767
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004768 if (len == 0)
4769 return 0;
4770 if (Py_UNICODE_ISLOWER(*s)) {
4771 *s = Py_UNICODE_TOUPPER(*s);
4772 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004774 s++;
4775 while (--len > 0) {
4776 if (Py_UNICODE_ISUPPER(*s)) {
4777 *s = Py_UNICODE_TOLOWER(*s);
4778 status = 1;
4779 }
4780 s++;
4781 }
4782 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783}
4784
4785static
4786int fixtitle(PyUnicodeObject *self)
4787{
4788 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4789 register Py_UNICODE *e;
4790 int previous_is_cased;
4791
4792 /* Shortcut for single character strings */
4793 if (PyUnicode_GET_SIZE(self) == 1) {
4794 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4795 if (*p != ch) {
4796 *p = ch;
4797 return 1;
4798 }
4799 else
4800 return 0;
4801 }
Tim Petersced69f82003-09-16 20:30:58 +00004802
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 e = p + PyUnicode_GET_SIZE(self);
4804 previous_is_cased = 0;
4805 for (; p < e; p++) {
4806 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 if (previous_is_cased)
4809 *p = Py_UNICODE_TOLOWER(ch);
4810 else
4811 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004812
4813 if (Py_UNICODE_ISLOWER(ch) ||
4814 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 Py_UNICODE_ISTITLE(ch))
4816 previous_is_cased = 1;
4817 else
4818 previous_is_cased = 0;
4819 }
4820 return 1;
4821}
4822
Tim Peters8ce9f162004-08-27 01:49:32 +00004823PyObject *
4824PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825{
Tim Peters8ce9f162004-08-27 01:49:32 +00004826 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004827 const Py_UNICODE blank = ' ';
4828 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004829 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004830 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004831 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4832 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004833 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4834 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004835 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004836 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004837 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838
Tim Peters05eba1f2004-08-27 21:32:02 +00004839 fseq = PySequence_Fast(seq, "");
4840 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004841 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004842 }
4843
Tim Peters91879ab2004-08-27 22:35:44 +00004844 /* Grrrr. A codec may be invoked to convert str objects to
4845 * Unicode, and so it's possible to call back into Python code
4846 * during PyUnicode_FromObject(), and so it's possible for a sick
4847 * codec to change the size of fseq (if seq is a list). Therefore
4848 * we have to keep refetching the size -- can't assume seqlen
4849 * is invariant.
4850 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004851 seqlen = PySequence_Fast_GET_SIZE(fseq);
4852 /* If empty sequence, return u"". */
4853 if (seqlen == 0) {
4854 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4855 goto Done;
4856 }
4857 /* If singleton sequence with an exact Unicode, return that. */
4858 if (seqlen == 1) {
4859 item = PySequence_Fast_GET_ITEM(fseq, 0);
4860 if (PyUnicode_CheckExact(item)) {
4861 Py_INCREF(item);
4862 res = (PyUnicodeObject *)item;
4863 goto Done;
4864 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004865 }
4866
Tim Peters05eba1f2004-08-27 21:32:02 +00004867 /* At least two items to join, or one that isn't exact Unicode. */
4868 if (seqlen > 1) {
4869 /* Set up sep and seplen -- they're needed. */
4870 if (separator == NULL) {
4871 sep = &blank;
4872 seplen = 1;
4873 }
4874 else {
4875 internal_separator = PyUnicode_FromObject(separator);
4876 if (internal_separator == NULL)
4877 goto onError;
4878 sep = PyUnicode_AS_UNICODE(internal_separator);
4879 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004880 /* In case PyUnicode_FromObject() mutated seq. */
4881 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004882 }
4883 }
4884
4885 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004886 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004887 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004888 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004889 res_p = PyUnicode_AS_UNICODE(res);
4890 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004891
Tim Peters05eba1f2004-08-27 21:32:02 +00004892 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004893 Py_ssize_t itemlen;
4894 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004895
4896 item = PySequence_Fast_GET_ITEM(fseq, i);
4897 /* Convert item to Unicode. */
4898 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4899 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004900 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004901 " %.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +00004902 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004903 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004904 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004905 item = PyUnicode_FromObject(item);
4906 if (item == NULL)
4907 goto onError;
4908 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004909
Tim Peters91879ab2004-08-27 22:35:44 +00004910 /* In case PyUnicode_FromObject() mutated seq. */
4911 seqlen = PySequence_Fast_GET_SIZE(fseq);
4912
Tim Peters8ce9f162004-08-27 01:49:32 +00004913 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004915 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004916 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004917 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004918 if (i < seqlen - 1) {
4919 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004920 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004921 goto Overflow;
4922 }
4923 if (new_res_used > res_alloc) {
4924 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004925 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004926 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004927 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004928 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004929 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004930 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004931 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004933 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004934 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004936
4937 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004938 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004939 res_p += itemlen;
4940 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004941 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004942 res_p += seplen;
4943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004945 res_used = new_res_used;
4946 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004947
Tim Peters05eba1f2004-08-27 21:32:02 +00004948 /* Shrink res to match the used area; this probably can't fail,
4949 * but it's cheap to check.
4950 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004951 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004952 goto onError;
4953
4954 Done:
4955 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004956 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957 return (PyObject *)res;
4958
Tim Peters8ce9f162004-08-27 01:49:32 +00004959 Overflow:
4960 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004961 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004962 Py_DECREF(item);
4963 /* fall through */
4964
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004966 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004967 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004968 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969 return NULL;
4970}
4971
Tim Petersced69f82003-09-16 20:30:58 +00004972static
4973PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004974 Py_ssize_t left,
4975 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976 Py_UNICODE fill)
4977{
4978 PyUnicodeObject *u;
4979
4980 if (left < 0)
4981 left = 0;
4982 if (right < 0)
4983 right = 0;
4984
Tim Peters7a29bd52001-09-12 03:03:31 +00004985 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 Py_INCREF(self);
4987 return self;
4988 }
4989
4990 u = _PyUnicode_New(left + self->length + right);
4991 if (u) {
4992 if (left)
4993 Py_UNICODE_FILL(u->str, fill, left);
4994 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4995 if (right)
4996 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4997 }
4998
4999 return u;
5000}
5001
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The macro relies on names provided by the enclosing function: a
   `PyObject *str` scratch variable, the `list` being built, and an
   `onError` label that is jumped to when either the slice creation or
   the append fails.

   Wrapped in do { } while (0) so that `SPLIT_APPEND(...);` is a single
   statement and is safe inside unbraced if/else bodies. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5012
5013static
5014PyObject *split_whitespace(PyUnicodeObject *self,
5015 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005016 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005018 register Py_ssize_t i;
5019 register Py_ssize_t j;
5020 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021 PyObject *str;
5022
5023 for (i = j = 0; i < len; ) {
5024 /* find a token */
5025 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5026 i++;
5027 j = i;
5028 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5029 i++;
5030 if (j < i) {
5031 if (maxcount-- <= 0)
5032 break;
5033 SPLIT_APPEND(self->str, j, i);
5034 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5035 i++;
5036 j = i;
5037 }
5038 }
5039 if (j < len) {
5040 SPLIT_APPEND(self->str, j, len);
5041 }
5042 return list;
5043
5044 onError:
5045 Py_DECREF(list);
5046 return NULL;
5047}
5048
5049PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005050 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005052 register Py_ssize_t i;
5053 register Py_ssize_t j;
5054 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 PyObject *list;
5056 PyObject *str;
5057 Py_UNICODE *data;
5058
5059 string = PyUnicode_FromObject(string);
5060 if (string == NULL)
5061 return NULL;
5062 data = PyUnicode_AS_UNICODE(string);
5063 len = PyUnicode_GET_SIZE(string);
5064
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 list = PyList_New(0);
5066 if (!list)
5067 goto onError;
5068
5069 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005070 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005071
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005073 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075
5076 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005077 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 if (i < len) {
5079 if (data[i] == '\r' && i + 1 < len &&
5080 data[i+1] == '\n')
5081 i += 2;
5082 else
5083 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005084 if (keepends)
5085 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 }
Guido van Rossum86662912000-04-11 15:38:46 +00005087 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 j = i;
5089 }
5090 if (j < len) {
5091 SPLIT_APPEND(data, j, len);
5092 }
5093
5094 Py_DECREF(string);
5095 return list;
5096
5097 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005098 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 Py_DECREF(string);
5100 return NULL;
5101}
5102
Tim Petersced69f82003-09-16 20:30:58 +00005103static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104PyObject *split_char(PyUnicodeObject *self,
5105 PyObject *list,
5106 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005107 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005109 register Py_ssize_t i;
5110 register Py_ssize_t j;
5111 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 PyObject *str;
5113
5114 for (i = j = 0; i < len; ) {
5115 if (self->str[i] == ch) {
5116 if (maxcount-- <= 0)
5117 break;
5118 SPLIT_APPEND(self->str, j, i);
5119 i = j = i + 1;
5120 } else
5121 i++;
5122 }
5123 if (j <= len) {
5124 SPLIT_APPEND(self->str, j, len);
5125 }
5126 return list;
5127
5128 onError:
5129 Py_DECREF(list);
5130 return NULL;
5131}
5132
Tim Petersced69f82003-09-16 20:30:58 +00005133static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134PyObject *split_substring(PyUnicodeObject *self,
5135 PyObject *list,
5136 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005137 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005139 register Py_ssize_t i;
5140 register Py_ssize_t j;
5141 Py_ssize_t len = self->length;
5142 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 PyObject *str;
5144
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005145 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 if (Py_UNICODE_MATCH(self, i, substring)) {
5147 if (maxcount-- <= 0)
5148 break;
5149 SPLIT_APPEND(self->str, j, i);
5150 i = j = i + sublen;
5151 } else
5152 i++;
5153 }
5154 if (j <= len) {
5155 SPLIT_APPEND(self->str, j, len);
5156 }
5157 return list;
5158
5159 onError:
5160 Py_DECREF(list);
5161 return NULL;
5162}
5163
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005164static
5165PyObject *rsplit_whitespace(PyUnicodeObject *self,
5166 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005167 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005168{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005169 register Py_ssize_t i;
5170 register Py_ssize_t j;
5171 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005172 PyObject *str;
5173
5174 for (i = j = len - 1; i >= 0; ) {
5175 /* find a token */
5176 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5177 i--;
5178 j = i;
5179 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5180 i--;
5181 if (j > i) {
5182 if (maxcount-- <= 0)
5183 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005184 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005185 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5186 i--;
5187 j = i;
5188 }
5189 }
5190 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005191 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005192 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005193 if (PyList_Reverse(list) < 0)
5194 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005195 return list;
5196
5197 onError:
5198 Py_DECREF(list);
5199 return NULL;
5200}
5201
5202static
5203PyObject *rsplit_char(PyUnicodeObject *self,
5204 PyObject *list,
5205 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005207{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005208 register Py_ssize_t i;
5209 register Py_ssize_t j;
5210 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005211 PyObject *str;
5212
5213 for (i = j = len - 1; i >= 0; ) {
5214 if (self->str[i] == ch) {
5215 if (maxcount-- <= 0)
5216 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005217 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005218 j = i = i - 1;
5219 } else
5220 i--;
5221 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005222 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005223 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005224 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005225 if (PyList_Reverse(list) < 0)
5226 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005227 return list;
5228
5229 onError:
5230 Py_DECREF(list);
5231 return NULL;
5232}
5233
5234static
5235PyObject *rsplit_substring(PyUnicodeObject *self,
5236 PyObject *list,
5237 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005240 register Py_ssize_t i;
5241 register Py_ssize_t j;
5242 Py_ssize_t len = self->length;
5243 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005244 PyObject *str;
5245
5246 for (i = len - sublen, j = len; i >= 0; ) {
5247 if (Py_UNICODE_MATCH(self, i, substring)) {
5248 if (maxcount-- <= 0)
5249 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005250 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005251 j = i;
5252 i -= sublen;
5253 } else
5254 i--;
5255 }
5256 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005257 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005258 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005259 if (PyList_Reverse(list) < 0)
5260 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005261 return list;
5262
5263 onError:
5264 Py_DECREF(list);
5265 return NULL;
5266}
5267
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268#undef SPLIT_APPEND
5269
5270static
5271PyObject *split(PyUnicodeObject *self,
5272 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005273 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274{
5275 PyObject *list;
5276
5277 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005278 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279
5280 list = PyList_New(0);
5281 if (!list)
5282 return NULL;
5283
5284 if (substring == NULL)
5285 return split_whitespace(self,list,maxcount);
5286
5287 else if (substring->length == 1)
5288 return split_char(self,list,substring->str[0],maxcount);
5289
5290 else if (substring->length == 0) {
5291 Py_DECREF(list);
5292 PyErr_SetString(PyExc_ValueError, "empty separator");
5293 return NULL;
5294 }
5295 else
5296 return split_substring(self,list,substring,maxcount);
5297}
5298
Tim Petersced69f82003-09-16 20:30:58 +00005299static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005300PyObject *rsplit(PyUnicodeObject *self,
5301 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005302 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005303{
5304 PyObject *list;
5305
5306 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005307 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005308
5309 list = PyList_New(0);
5310 if (!list)
5311 return NULL;
5312
5313 if (substring == NULL)
5314 return rsplit_whitespace(self,list,maxcount);
5315
5316 else if (substring->length == 1)
5317 return rsplit_char(self,list,substring->str[0],maxcount);
5318
5319 else if (substring->length == 0) {
5320 Py_DECREF(list);
5321 PyErr_SetString(PyExc_ValueError, "empty separator");
5322 return NULL;
5323 }
5324 else
5325 return rsplit_substring(self,list,substring,maxcount);
5326}
5327
5328static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329PyObject *replace(PyUnicodeObject *self,
5330 PyUnicodeObject *str1,
5331 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005332 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333{
5334 PyUnicodeObject *u;
5335
5336 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005337 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338
Fredrik Lundh347ee272006-05-24 16:35:18 +00005339 if (str1->length == str2->length) {
5340 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005341 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005342 if (str1->length == 1) {
5343 /* replace characters */
5344 Py_UNICODE u1, u2;
5345 if (!findchar(self->str, self->length, str1->str[0]))
5346 goto nothing;
5347 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5348 if (!u)
5349 return NULL;
5350 Py_UNICODE_COPY(u->str, self->str, self->length);
5351 u1 = str1->str[0];
5352 u2 = str2->str[0];
5353 for (i = 0; i < u->length; i++)
5354 if (u->str[i] == u1) {
5355 if (--maxcount < 0)
5356 break;
5357 u->str[i] = u2;
5358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005360 i = fastsearch(
5361 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005363 if (i < 0)
5364 goto nothing;
5365 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5366 if (!u)
5367 return NULL;
5368 Py_UNICODE_COPY(u->str, self->str, self->length);
5369 while (i <= self->length - str1->length)
5370 if (Py_UNICODE_MATCH(self, i, str1)) {
5371 if (--maxcount < 0)
5372 break;
5373 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5374 i += str1->length;
5375 } else
5376 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005379
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005380 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005381 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 Py_UNICODE *p;
5383
5384 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005385 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 if (n > maxcount)
5387 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005388 if (n == 0)
5389 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005390 /* new_size = self->length + n * (str2->length - str1->length)); */
5391 delta = (str2->length - str1->length);
5392 if (delta == 0) {
5393 new_size = self->length;
5394 } else {
5395 product = n * (str2->length - str1->length);
5396 if ((product / (str2->length - str1->length)) != n) {
5397 PyErr_SetString(PyExc_OverflowError,
5398 "replace string is too long");
5399 return NULL;
5400 }
5401 new_size = self->length + product;
5402 if (new_size < 0) {
5403 PyErr_SetString(PyExc_OverflowError,
5404 "replace string is too long");
5405 return NULL;
5406 }
5407 }
5408 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005409 if (!u)
5410 return NULL;
5411 i = 0;
5412 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005413 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005414 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005415 while (n-- > 0) {
5416 /* look for next match */
5417 j = i;
5418 while (j <= e) {
5419 if (Py_UNICODE_MATCH(self, j, str1))
5420 break;
5421 j++;
5422 }
5423 if (j > i) {
5424 if (j > e)
5425 break;
5426 /* copy unchanged part [i:j] */
5427 Py_UNICODE_COPY(p, self->str+i, j-i);
5428 p += j - i;
5429 }
5430 /* copy substitution string */
5431 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005432 Py_UNICODE_COPY(p, str2->str, str2->length);
5433 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005434 }
5435 i = j + str1->length;
5436 }
5437 if (i < self->length)
5438 /* copy tail [i:] */
5439 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005440 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005441 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005442 while (n > 0) {
5443 Py_UNICODE_COPY(p, str2->str, str2->length);
5444 p += str2->length;
5445 if (--n <= 0)
5446 break;
5447 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005449 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 }
5451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005453
5454nothing:
5455 /* nothing to replace; return original string (when possible) */
5456 if (PyUnicode_CheckExact(self)) {
5457 Py_INCREF(self);
5458 return (PyObject *) self;
5459 }
5460 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461}
5462
5463/* --- Unicode Object Methods --------------------------------------------- */
5464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005465PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466"S.title() -> unicode\n\
5467\n\
5468Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005469characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470
5471static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005472unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 return fixup(self, fixtitle);
5475}
5476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005477PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478"S.capitalize() -> unicode\n\
5479\n\
5480Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005481have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482
5483static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005484unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 return fixup(self, fixcapitalize);
5487}
5488
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word via fixup(), and join the words with a blank. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                  fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item; do it by hand. */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5526
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005527/* Argument converter. Coerces to a single unicode character */
5528
5529static int
5530convert_uc(PyObject *obj, void *addr)
5531{
5532 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5533 PyObject *uniobj;
5534 Py_UNICODE *unistr;
5535
5536 uniobj = PyUnicode_FromObject(obj);
5537 if (uniobj == NULL) {
5538 PyErr_SetString(PyExc_TypeError,
5539 "The fill character cannot be converted to Unicode");
5540 return 0;
5541 }
5542 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5543 PyErr_SetString(PyExc_TypeError,
5544 "The fill character must be exactly one character long");
5545 Py_DECREF(uniobj);
5546 return 0;
5547 }
5548 unistr = PyUnicode_AS_UNICODE(uniobj);
5549 *fillcharloc = unistr[0];
5550 Py_DECREF(uniobj);
5551 return 1;
5552}
5553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005554PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005555"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005557Return S centered in a Unicode string of length width. Padding is\n\
5558done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559
5560static PyObject *
5561unicode_center(PyUnicodeObject *self, PyObject *args)
5562{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005563 Py_ssize_t marg, left;
5564 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005565 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566
Thomas Woutersde017742006-02-16 19:34:37 +00005567 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 return NULL;
5569
Tim Peters7a29bd52001-09-12 03:03:31 +00005570 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 Py_INCREF(self);
5572 return (PyObject*) self;
5573 }
5574
5575 marg = width - self->length;
5576 left = marg / 2 + (marg & width & 1);
5577
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005578 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579}
5580
Marc-André Lemburge5034372000-08-08 08:04:29 +00005581#if 0
5582
5583/* This code should go into some future Unicode collation support
5584 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005585 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005586
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005587/* speedy UTF-16 code point order comparison */
5588/* gleaned from: */
5589/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5590
/* Per-block adjustment table, indexed by (code unit >> 11), i.e. one
   entry per 0x800-wide block of the 16-bit range.  Only the last five
   blocks are adjusted: block 27 (0xD800-0xDFFF) is shifted up by
   0x2000 and blocks 28-31 (0xE000-0xFFFF) are shifted down by 0x800. */
static short utf16Fixup[32] =
{
    /*  0 -  7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 - 15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 - 23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 - 31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
5598
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599static int
5600unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5601{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005602 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005603
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 Py_UNICODE *s1 = str1->str;
5605 Py_UNICODE *s2 = str2->str;
5606
5607 len1 = str1->length;
5608 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005609
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005611 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005612
5613 c1 = *s1++;
5614 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005615
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005616 if (c1 > (1<<11) * 26)
5617 c1 += utf16Fixup[c1>>11];
5618 if (c2 > (1<<11) * 26)
5619 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005620 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005621
5622 if (c1 != c2)
5623 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005624
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005625 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 }
5627
5628 return (len1 < len2) ? -1 : (len1 != len2);
5629}
5630
Marc-André Lemburge5034372000-08-08 08:04:29 +00005631#else
5632
5633static int
5634unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5635{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005636 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005637
5638 Py_UNICODE *s1 = str1->str;
5639 Py_UNICODE *s2 = str2->str;
5640
5641 len1 = str1->length;
5642 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005643
Marc-André Lemburge5034372000-08-08 08:04:29 +00005644 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005645 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005646
Fredrik Lundh45714e92001-06-26 16:39:36 +00005647 c1 = *s1++;
5648 c2 = *s2++;
5649
5650 if (c1 != c2)
5651 return (c1 < c2) ? -1 : 1;
5652
Marc-André Lemburge5034372000-08-08 08:04:29 +00005653 len1--; len2--;
5654 }
5655
5656 return (len1 < len2) ? -1 : (len1 != len2);
5657}
5658
5659#endif
5660
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661int PyUnicode_Compare(PyObject *left,
5662 PyObject *right)
5663{
5664 PyUnicodeObject *u = NULL, *v = NULL;
5665 int result;
5666
5667 /* Coerce the two arguments */
5668 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5669 if (u == NULL)
5670 goto onError;
5671 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5672 if (v == NULL)
5673 goto onError;
5674
Thomas Wouters7e474022000-07-16 12:04:32 +00005675 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 if (v == u) {
5677 Py_DECREF(u);
5678 Py_DECREF(v);
5679 return 0;
5680 }
5681
5682 result = unicode_compare(u, v);
5683
5684 Py_DECREF(u);
5685 Py_DECREF(v);
5686 return result;
5687
5688onError:
5689 Py_XDECREF(u);
5690 Py_XDECREF(v);
5691 return -1;
5692}
5693
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005694PyObject *PyUnicode_RichCompare(PyObject *left,
5695 PyObject *right,
5696 int op)
5697{
5698 int result;
5699
5700 result = PyUnicode_Compare(left, right);
5701 if (result == -1 && PyErr_Occurred())
5702 goto onError;
5703
5704 /* Convert the return value to a Boolean */
5705 switch (op) {
5706 case Py_EQ:
5707 result = (result == 0);
5708 break;
5709 case Py_NE:
5710 result = (result != 0);
5711 break;
5712 case Py_LE:
5713 result = (result <= 0);
5714 break;
5715 case Py_GE:
5716 result = (result >= 0);
5717 break;
5718 case Py_LT:
5719 result = (result == -1);
5720 break;
5721 case Py_GT:
5722 result = (result == 1);
5723 break;
5724 }
5725 return PyBool_FromLong(result);
5726
5727 onError:
5728
5729 /* Standard case
5730
5731 Type errors mean that PyUnicode_FromObject() could not convert
5732 one of the arguments (usually the right hand side) to Unicode,
5733 ie. we can't handle the comparison request. However, it is
5734 possible that the other object knows a comparison method, which
5735 is why we return Py_NotImplemented to give the other object a
5736 chance.
5737
5738 */
5739 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5740 PyErr_Clear();
5741 Py_INCREF(Py_NotImplemented);
5742 return Py_NotImplemented;
5743 }
5744 if (op != Py_EQ && op != Py_NE)
5745 return NULL;
5746
5747 /* Equality comparison.
5748
5749 This is a special case: we silence any PyExc_UnicodeDecodeError
5750 and instead turn it into a PyErr_UnicodeWarning.
5751
5752 */
5753 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5754 return NULL;
5755 PyErr_Clear();
5756 if (PyErr_Warn(PyExc_UnicodeWarning,
5757 (op == Py_EQ) ?
5758 "Unicode equal comparison "
5759 "failed to convert both arguments to Unicode - "
5760 "interpreting them as being unequal" :
5761 "Unicode unequal comparison "
5762 "failed to convert both arguments to Unicode - "
5763 "interpreting them as being unequal"
5764 ) < 0)
5765 return NULL;
5766 result = (op == Py_NE);
5767 return PyBool_FromLong(result);
5768}
5769
Guido van Rossum403d68b2000-03-13 15:55:09 +00005770int PyUnicode_Contains(PyObject *container,
5771 PyObject *element)
5772{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005773 PyObject *str, *sub;
5774 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005775
5776 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005777 sub = PyUnicode_FromObject(element);
5778 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005779 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005780 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005781 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005782 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005783
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005784 str = PyUnicode_FromObject(container);
5785 if (!str) {
5786 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005787 return -1;
5788 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005789
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005790 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005791
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005792 Py_DECREF(str);
5793 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005794
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005795 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005796}
5797
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798/* Concat to string or Unicode object giving a new Unicode object. */
5799
5800PyObject *PyUnicode_Concat(PyObject *left,
5801 PyObject *right)
5802{
5803 PyUnicodeObject *u = NULL, *v = NULL, *w;
5804
5805 /* Coerce the two arguments */
5806 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5807 if (u == NULL)
5808 goto onError;
5809 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5810 if (v == NULL)
5811 goto onError;
5812
5813 /* Shortcuts */
5814 if (v == unicode_empty) {
5815 Py_DECREF(v);
5816 return (PyObject *)u;
5817 }
5818 if (u == unicode_empty) {
5819 Py_DECREF(u);
5820 return (PyObject *)v;
5821 }
5822
5823 /* Concat the two Unicode strings */
5824 w = _PyUnicode_New(u->length + v->length);
5825 if (w == NULL)
5826 goto onError;
5827 Py_UNICODE_COPY(w->str, u->str, u->length);
5828 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5829
5830 Py_DECREF(u);
5831 Py_DECREF(v);
5832 return (PyObject *)w;
5833
5834onError:
5835 Py_XDECREF(u);
5836 Py_XDECREF(v);
5837 return NULL;
5838}
5839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005840PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841"S.count(sub[, start[, end]]) -> int\n\
5842\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005843Return the number of non-overlapping occurrences of substring sub in\n\
5844Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005845interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846
5847static PyObject *
5848unicode_count(PyUnicodeObject *self, PyObject *args)
5849{
5850 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005851 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005852 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 PyObject *result;
5854
Guido van Rossumb8872e62000-05-09 14:14:27 +00005855 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5856 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 return NULL;
5858
5859 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005860 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 if (substring == NULL)
5862 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005863
Fredrik Lundhc8162812006-05-26 19:33:03 +00005864 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005866 result = PyInt_FromSsize_t(
5867 stringlib_count(self->str + start, end - start,
5868 substring->str, substring->length)
5869 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
5871 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005872
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 return result;
5874}
5875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005876PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005877"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005879Encodes S using the codec registered for encoding. encoding defaults\n\
5880to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005881handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5883'xmlcharrefreplace' as well as any other name registered with\n\
5884codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885
5886static PyObject *
5887unicode_encode(PyUnicodeObject *self, PyObject *args)
5888{
5889 char *encoding = NULL;
5890 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005891 PyObject *v;
5892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5894 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005895 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005896 if (v == NULL)
5897 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005898 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5899 PyErr_Format(PyExc_TypeError,
5900 "encoder did not return a string/unicode object "
5901 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005902 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005903 Py_DECREF(v);
5904 return NULL;
5905 }
5906 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005907
5908 onError:
5909 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005910}
5911
5912PyDoc_STRVAR(decode__doc__,
5913"S.decode([encoding[,errors]]) -> string or unicode\n\
5914\n\
5915Decodes S using the codec registered for encoding. encoding defaults\n\
5916to the default encoding. errors may be given to set a different error\n\
5917handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5918a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5919as well as any other name registerd with codecs.register_error that is\n\
5920able to handle UnicodeDecodeErrors.");
5921
5922static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005923unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005924{
5925 char *encoding = NULL;
5926 char *errors = NULL;
5927 PyObject *v;
5928
5929 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5930 return NULL;
5931 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005932 if (v == NULL)
5933 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005934 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5935 PyErr_Format(PyExc_TypeError,
5936 "decoder did not return a string/unicode object "
5937 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005938 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005939 Py_DECREF(v);
5940 return NULL;
5941 }
5942 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005943
5944 onError:
5945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946}
5947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005948PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949"S.expandtabs([tabsize]) -> unicode\n\
5950\n\
5951Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005952If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953
5954static PyObject*
5955unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5956{
5957 Py_UNICODE *e;
5958 Py_UNICODE *p;
5959 Py_UNICODE *q;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005960 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 PyUnicodeObject *u;
5962 int tabsize = 8;
5963
5964 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5965 return NULL;
5966
Thomas Wouters7e474022000-07-16 12:04:32 +00005967 /* First pass: determine size of output string */
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005968 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 e = self->str + self->length;
5970 for (p = self->str; p < e; p++)
5971 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005972 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 j += tabsize - (j % tabsize);
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005974 if (old_j > j) {
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00005975 PyErr_SetString(PyExc_OverflowError,
5976 "new string is too long");
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005977 return NULL;
5978 }
5979 old_j = j;
5980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 }
5982 else {
5983 j++;
5984 if (*p == '\n' || *p == '\r') {
5985 i += j;
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00005986 old_j = j = 0;
5987 if (i < 0) {
5988 PyErr_SetString(PyExc_OverflowError,
5989 "new string is too long");
5990 return NULL;
5991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 }
5993 }
5994
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005995 if ((i + j) < 0) {
5996 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5997 return NULL;
5998 }
5999
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 /* Second pass: create output string and fill it */
6001 u = _PyUnicode_New(i + j);
6002 if (!u)
6003 return NULL;
6004
6005 j = 0;
6006 q = u->str;
6007
6008 for (p = self->str; p < e; p++)
6009 if (*p == '\t') {
6010 if (tabsize > 0) {
6011 i = tabsize - (j % tabsize);
6012 j += i;
6013 while (i--)
6014 *q++ = ' ';
6015 }
6016 }
6017 else {
6018 j++;
6019 *q++ = *p;
6020 if (*p == '\n' || *p == '\r')
6021 j = 0;
6022 }
6023
6024 return (PyObject*) u;
6025}
6026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006027PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028"S.find(sub [,start [,end]]) -> int\n\
6029\n\
6030Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006031such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032arguments start and end are interpreted as in slice notation.\n\
6033\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006034Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
6036static PyObject *
6037unicode_find(PyUnicodeObject *self, PyObject *args)
6038{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006039 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006040 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006041 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006042 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043
Guido van Rossumb8872e62000-05-09 14:14:27 +00006044 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6045 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006047 substring = PyUnicode_FromObject(substring);
6048 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 return NULL;
6050
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006051 result = stringlib_find_slice(
6052 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6053 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6054 start, end
6055 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056
6057 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006058
6059 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060}
6061
6062static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006063unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064{
6065 if (index < 0 || index >= self->length) {
6066 PyErr_SetString(PyExc_IndexError, "string index out of range");
6067 return NULL;
6068 }
6069
6070 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6071}
6072
6073static long
6074unicode_hash(PyUnicodeObject *self)
6075{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006076 /* Since Unicode objects compare equal to their ASCII string
6077 counterparts, they should use the individual character values
6078 as basis for their hash value. This is needed to assure that
6079 strings and Unicode objects behave in the same way as
6080 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081
Martin v. Löwis18e16552006-02-15 17:27:45 +00006082 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006083 register Py_UNICODE *p;
6084 register long x;
6085
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 if (self->hash != -1)
6087 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006088 len = PyUnicode_GET_SIZE(self);
6089 p = PyUnicode_AS_UNICODE(self);
6090 x = *p << 7;
6091 while (--len >= 0)
6092 x = (1000003*x) ^ *p++;
6093 x ^= PyUnicode_GET_SIZE(self);
6094 if (x == -1)
6095 x = -2;
6096 self->hash = x;
6097 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098}
6099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006100PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101"S.index(sub [,start [,end]]) -> int\n\
6102\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006103Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104
6105static PyObject *
6106unicode_index(PyUnicodeObject *self, PyObject *args)
6107{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006108 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006109 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006110 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006111 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112
Guido van Rossumb8872e62000-05-09 14:14:27 +00006113 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6114 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006116 substring = PyUnicode_FromObject(substring);
6117 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 return NULL;
6119
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006120 result = stringlib_find_slice(
6121 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6122 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6123 start, end
6124 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125
6126 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006127
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 if (result < 0) {
6129 PyErr_SetString(PyExc_ValueError, "substring not found");
6130 return NULL;
6131 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006132
Martin v. Löwis18e16552006-02-15 17:27:45 +00006133 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134}
6135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006136PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006137"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006139Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006140at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141
6142static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006143unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144{
6145 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6146 register const Py_UNICODE *e;
6147 int cased;
6148
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 /* Shortcut for single character strings */
6150 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006151 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006153 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006154 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006155 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006156
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 e = p + PyUnicode_GET_SIZE(self);
6158 cased = 0;
6159 for (; p < e; p++) {
6160 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006161
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006163 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 else if (!cased && Py_UNICODE_ISLOWER(ch))
6165 cased = 1;
6166 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006167 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168}
6169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006170PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006171"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006173Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006174at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175
6176static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006177unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178{
6179 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6180 register const Py_UNICODE *e;
6181 int cased;
6182
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 /* Shortcut for single character strings */
6184 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006185 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006187 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006188 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006189 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006190
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 e = p + PyUnicode_GET_SIZE(self);
6192 cased = 0;
6193 for (; p < e; p++) {
6194 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006195
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006197 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 else if (!cased && Py_UNICODE_ISUPPER(ch))
6199 cased = 1;
6200 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006201 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202}
6203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006204PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006205"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006207Return True if S is a titlecased string and there is at least one\n\
6208character in S, i.e. upper- and titlecase characters may only\n\
6209follow uncased characters and lowercase characters only cased ones.\n\
6210Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
6212static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006213unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214{
6215 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6216 register const Py_UNICODE *e;
6217 int cased, previous_is_cased;
6218
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 /* Shortcut for single character strings */
6220 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006221 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6222 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006224 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006225 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006226 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006227
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 e = p + PyUnicode_GET_SIZE(self);
6229 cased = 0;
6230 previous_is_cased = 0;
6231 for (; p < e; p++) {
6232 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006233
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6235 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006236 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 previous_is_cased = 1;
6238 cased = 1;
6239 }
6240 else if (Py_UNICODE_ISLOWER(ch)) {
6241 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006242 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 previous_is_cased = 1;
6244 cased = 1;
6245 }
6246 else
6247 previous_is_cased = 0;
6248 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006249 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250}
6251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006252PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006253"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006255Return True if all characters in S are whitespace\n\
6256and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257
6258static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006259unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
6261 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6262 register const Py_UNICODE *e;
6263
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 /* Shortcut for single character strings */
6265 if (PyUnicode_GET_SIZE(self) == 1 &&
6266 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006267 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006269 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006270 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006271 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006272
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 e = p + PyUnicode_GET_SIZE(self);
6274 for (; p < e; p++) {
6275 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006276 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006278 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279}
6280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006281PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006282"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006283\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006284Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006285and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006286
6287static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006288unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006289{
6290 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6291 register const Py_UNICODE *e;
6292
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006293 /* Shortcut for single character strings */
6294 if (PyUnicode_GET_SIZE(self) == 1 &&
6295 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006296 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006297
6298 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006299 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006300 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006301
6302 e = p + PyUnicode_GET_SIZE(self);
6303 for (; p < e; p++) {
6304 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006305 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006306 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006307 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006308}
6309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006310PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006311"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006312\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006313Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006314and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006315
6316static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006317unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006318{
6319 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6320 register const Py_UNICODE *e;
6321
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006322 /* Shortcut for single character strings */
6323 if (PyUnicode_GET_SIZE(self) == 1 &&
6324 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006325 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006326
6327 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006328 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006329 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006330
6331 e = p + PyUnicode_GET_SIZE(self);
6332 for (; p < e; p++) {
6333 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006334 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006335 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006336 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006337}
6338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006339PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006340"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006342Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006343False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344
6345static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006346unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347{
6348 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6349 register const Py_UNICODE *e;
6350
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 /* Shortcut for single character strings */
6352 if (PyUnicode_GET_SIZE(self) == 1 &&
6353 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006354 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006356 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006357 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006358 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006359
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 e = p + PyUnicode_GET_SIZE(self);
6361 for (; p < e; p++) {
6362 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006363 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006365 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366}
6367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006368PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006369"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006371Return True if all characters in S are digits\n\
6372and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373
6374static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006375unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376{
6377 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6378 register const Py_UNICODE *e;
6379
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 /* Shortcut for single character strings */
6381 if (PyUnicode_GET_SIZE(self) == 1 &&
6382 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006383 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006385 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006386 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006387 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006388
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 e = p + PyUnicode_GET_SIZE(self);
6390 for (; p < e; p++) {
6391 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006392 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006394 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395}
6396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006397PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006398"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006400Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006401False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402
6403static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006404unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405{
6406 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6407 register const Py_UNICODE *e;
6408
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 /* Shortcut for single character strings */
6410 if (PyUnicode_GET_SIZE(self) == 1 &&
6411 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006412 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006414 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006415 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006416 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006417
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 e = p + PyUnicode_GET_SIZE(self);
6419 for (; p < e; p++) {
6420 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006421 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006423 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424}
6425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006426PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427"S.join(sequence) -> unicode\n\
6428\n\
6429Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006430sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431
6432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006433unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006435 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436}
6437
Martin v. Löwis18e16552006-02-15 17:27:45 +00006438static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439unicode_length(PyUnicodeObject *self)
6440{
6441 return self->length;
6442}
6443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006444PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006445"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446\n\
6447Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006448done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449
6450static PyObject *
6451unicode_ljust(PyUnicodeObject *self, PyObject *args)
6452{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006453 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006454 Py_UNICODE fillchar = ' ';
6455
Martin v. Löwis412fb672006-04-13 06:34:32 +00006456 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 return NULL;
6458
Tim Peters7a29bd52001-09-12 03:03:31 +00006459 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 Py_INCREF(self);
6461 return (PyObject*) self;
6462 }
6463
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006464 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465}
6466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006467PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468"S.lower() -> unicode\n\
6469\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006470Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
6472static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006473unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 return fixup(self, fixlower);
6476}
6477
/* Strip direction selectors; they double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, one per selector above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string past its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
6486
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006487/* externally visible for str.strip(unicode) */
6488PyObject *
6489_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6490{
6491 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006492 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006493 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006494 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6495 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006496
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006497 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6498
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006499 i = 0;
6500 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006501 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6502 i++;
6503 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006504 }
6505
6506 j = len;
6507 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006508 do {
6509 j--;
6510 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6511 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006512 }
6513
6514 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006515 Py_INCREF(self);
6516 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006517 }
6518 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006519 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006520}
6521
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522
6523static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006524do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006526 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006527 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006528
6529 i = 0;
6530 if (striptype != RIGHTSTRIP) {
6531 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6532 i++;
6533 }
6534 }
6535
6536 j = len;
6537 if (striptype != LEFTSTRIP) {
6538 do {
6539 j--;
6540 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6541 j++;
6542 }
6543
6544 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6545 Py_INCREF(self);
6546 return (PyObject*)self;
6547 }
6548 else
6549 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550}
6551
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006552
6553static PyObject *
6554do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6555{
6556 PyObject *sep = NULL;
6557
6558 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6559 return NULL;
6560
6561 if (sep != NULL && sep != Py_None) {
6562 if (PyUnicode_Check(sep))
6563 return _PyUnicode_XStrip(self, striptype, sep);
6564 else if (PyString_Check(sep)) {
6565 PyObject *res;
6566 sep = PyUnicode_FromObject(sep);
6567 if (sep==NULL)
6568 return NULL;
6569 res = _PyUnicode_XStrip(self, striptype, sep);
6570 Py_DECREF(sep);
6571 return res;
6572 }
6573 else {
6574 PyErr_Format(PyExc_TypeError,
6575 "%s arg must be None, unicode or str",
6576 STRIPNAME(striptype));
6577 return NULL;
6578 }
6579 }
6580
6581 return do_strip(self, striptype);
6582}
6583
6584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006585PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006586"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006587\n\
6588Return a copy of the string S with leading and trailing\n\
6589whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006590If chars is given and not None, remove characters in chars instead.\n\
6591If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006592
6593static PyObject *
6594unicode_strip(PyUnicodeObject *self, PyObject *args)
6595{
6596 if (PyTuple_GET_SIZE(args) == 0)
6597 return do_strip(self, BOTHSTRIP); /* Common case */
6598 else
6599 return do_argstrip(self, BOTHSTRIP, args);
6600}
6601
6602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006603PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006604"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006605\n\
6606Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006607If chars is given and not None, remove characters in chars instead.\n\
6608If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006609
6610static PyObject *
6611unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6612{
6613 if (PyTuple_GET_SIZE(args) == 0)
6614 return do_strip(self, LEFTSTRIP); /* Common case */
6615 else
6616 return do_argstrip(self, LEFTSTRIP, args);
6617}
6618
6619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006620PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006621"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006622\n\
6623Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006624If chars is given and not None, remove characters in chars instead.\n\
6625If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006626
6627static PyObject *
6628unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6629{
6630 if (PyTuple_GET_SIZE(args) == 0)
6631 return do_strip(self, RIGHTSTRIP); /* Common case */
6632 else
6633 return do_argstrip(self, RIGHTSTRIP, args);
6634}
6635
6636
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006638unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
6640 PyUnicodeObject *u;
6641 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006642 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006643 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644
6645 if (len < 0)
6646 len = 0;
6647
Tim Peters7a29bd52001-09-12 03:03:31 +00006648 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 /* no repeat, return original string */
6650 Py_INCREF(str);
6651 return (PyObject*) str;
6652 }
Tim Peters8f422462000-09-09 06:13:41 +00006653
6654 /* ensure # of chars needed doesn't overflow int and # of bytes
6655 * needed doesn't overflow size_t
6656 */
6657 nchars = len * str->length;
6658 if (len && nchars / len != str->length) {
6659 PyErr_SetString(PyExc_OverflowError,
6660 "repeated string is too long");
6661 return NULL;
6662 }
6663 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6664 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6665 PyErr_SetString(PyExc_OverflowError,
6666 "repeated string is too long");
6667 return NULL;
6668 }
6669 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 if (!u)
6671 return NULL;
6672
6673 p = u->str;
6674
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006675 if (str->length == 1 && len > 0) {
6676 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006677 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006678 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006679 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006680 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006681 done = str->length;
6682 }
6683 while (done < nchars) {
6684 int n = (done <= nchars-done) ? done : nchars-done;
6685 Py_UNICODE_COPY(p+done, p, n);
6686 done += n;
6687 }
6688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689
6690 return (PyObject*) u;
6691}
6692
6693PyObject *PyUnicode_Replace(PyObject *obj,
6694 PyObject *subobj,
6695 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006696 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697{
6698 PyObject *self;
6699 PyObject *str1;
6700 PyObject *str2;
6701 PyObject *result;
6702
6703 self = PyUnicode_FromObject(obj);
6704 if (self == NULL)
6705 return NULL;
6706 str1 = PyUnicode_FromObject(subobj);
6707 if (str1 == NULL) {
6708 Py_DECREF(self);
6709 return NULL;
6710 }
6711 str2 = PyUnicode_FromObject(replobj);
6712 if (str2 == NULL) {
6713 Py_DECREF(self);
6714 Py_DECREF(str1);
6715 return NULL;
6716 }
Tim Petersced69f82003-09-16 20:30:58 +00006717 result = replace((PyUnicodeObject *)self,
6718 (PyUnicodeObject *)str1,
6719 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 maxcount);
6721 Py_DECREF(self);
6722 Py_DECREF(str1);
6723 Py_DECREF(str2);
6724 return result;
6725}
6726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006727PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728"S.replace (old, new[, maxsplit]) -> unicode\n\
6729\n\
6730Return a copy of S with all occurrences of substring\n\
6731old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006732given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733
6734static PyObject*
6735unicode_replace(PyUnicodeObject *self, PyObject *args)
6736{
6737 PyUnicodeObject *str1;
6738 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006739 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 PyObject *result;
6741
Martin v. Löwis18e16552006-02-15 17:27:45 +00006742 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 return NULL;
6744 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6745 if (str1 == NULL)
6746 return NULL;
6747 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006748 if (str2 == NULL) {
6749 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752
6753 result = replace(self, str1, str2, maxcount);
6754
6755 Py_DECREF(str1);
6756 Py_DECREF(str2);
6757 return result;
6758}
6759
6760static
6761PyObject *unicode_repr(PyObject *unicode)
6762{
6763 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6764 PyUnicode_GET_SIZE(unicode),
6765 1);
6766}
6767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006768PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769"S.rfind(sub [,start [,end]]) -> int\n\
6770\n\
6771Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006772such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773arguments start and end are interpreted as in slice notation.\n\
6774\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006775Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776
6777static PyObject *
6778unicode_rfind(PyUnicodeObject *self, PyObject *args)
6779{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006780 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006781 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006782 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006783 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784
Guido van Rossumb8872e62000-05-09 14:14:27 +00006785 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6786 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006788 substring = PyUnicode_FromObject(substring);
6789 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 return NULL;
6791
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006792 result = stringlib_rfind_slice(
6793 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6794 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6795 start, end
6796 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797
6798 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006799
6800 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801}
6802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804"S.rindex(sub [,start [,end]]) -> int\n\
6805\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807
6808static PyObject *
6809unicode_rindex(PyUnicodeObject *self, PyObject *args)
6810{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006811 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006812 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006813 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006814 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815
Guido van Rossumb8872e62000-05-09 14:14:27 +00006816 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6817 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006819 substring = PyUnicode_FromObject(substring);
6820 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 return NULL;
6822
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006823 result = stringlib_rfind_slice(
6824 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6825 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6826 start, end
6827 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828
6829 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006830
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831 if (result < 0) {
6832 PyErr_SetString(PyExc_ValueError, "substring not found");
6833 return NULL;
6834 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006835 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836}
6837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006838PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006839"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840\n\
6841Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006842done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
6844static PyObject *
6845unicode_rjust(PyUnicodeObject *self, PyObject *args)
6846{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006847 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006848 Py_UNICODE fillchar = ' ';
6849
Martin v. Löwis412fb672006-04-13 06:34:32 +00006850 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 return NULL;
6852
Tim Peters7a29bd52001-09-12 03:03:31 +00006853 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 Py_INCREF(self);
6855 return (PyObject*) self;
6856 }
6857
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006858 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859}
6860
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006862unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863{
6864 /* standard clamping */
6865 if (start < 0)
6866 start = 0;
6867 if (end < 0)
6868 end = 0;
6869 if (end > self->length)
6870 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006871 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 /* full slice, return original string */
6873 Py_INCREF(self);
6874 return (PyObject*) self;
6875 }
6876 if (start > end)
6877 start = end;
6878 /* copy slice */
6879 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6880 end - start);
6881}
6882
6883PyObject *PyUnicode_Split(PyObject *s,
6884 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006885 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
6887 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 s = PyUnicode_FromObject(s);
6890 if (s == NULL)
6891 return NULL;
6892 if (sep != NULL) {
6893 sep = PyUnicode_FromObject(sep);
6894 if (sep == NULL) {
6895 Py_DECREF(s);
6896 return NULL;
6897 }
6898 }
6899
6900 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6901
6902 Py_DECREF(s);
6903 Py_XDECREF(sep);
6904 return result;
6905}
6906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006907PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908"S.split([sep [,maxsplit]]) -> list of strings\n\
6909\n\
6910Return a list of the words in S, using sep as the\n\
6911delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006912splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006913any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914
6915static PyObject*
6916unicode_split(PyUnicodeObject *self, PyObject *args)
6917{
6918 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006919 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920
Martin v. Löwis18e16552006-02-15 17:27:45 +00006921 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 return NULL;
6923
6924 if (substring == Py_None)
6925 return split(self, NULL, maxcount);
6926 else if (PyUnicode_Check(substring))
6927 return split(self, (PyUnicodeObject *)substring, maxcount);
6928 else
6929 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6930}
6931
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006932PyObject *
6933PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6934{
6935 PyObject* str_obj;
6936 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006937 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006938
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006939 str_obj = PyUnicode_FromObject(str_in);
6940 if (!str_obj)
6941 return NULL;
6942 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006943 if (!sep_obj) {
6944 Py_DECREF(str_obj);
6945 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006946 }
6947
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006948 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006949 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6950 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6951 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006952
Fredrik Lundhb9479482006-05-26 17:22:38 +00006953 Py_DECREF(sep_obj);
6954 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006955
6956 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006957}
6958
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006959
6960PyObject *
6961PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6962{
6963 PyObject* str_obj;
6964 PyObject* sep_obj;
6965 PyObject* out;
6966
6967 str_obj = PyUnicode_FromObject(str_in);
6968 if (!str_obj)
6969 return NULL;
6970 sep_obj = PyUnicode_FromObject(sep_in);
6971 if (!sep_obj) {
6972 Py_DECREF(str_obj);
6973 return NULL;
6974 }
6975
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006976 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006977 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6978 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6979 );
6980
6981 Py_DECREF(sep_obj);
6982 Py_DECREF(str_obj);
6983
6984 return out;
6985}
6986
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006987PyDoc_STRVAR(partition__doc__,
6988"S.partition(sep) -> (head, sep, tail)\n\
6989\n\
6990Searches for the separator sep in S, and returns the part before it,\n\
6991the separator itself, and the part after it. If the separator is not\n\
6992found, returns S and two empty strings.");
6993
6994static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006995unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006996{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006997 return PyUnicode_Partition((PyObject *)self, separator);
6998}
6999
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007000PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007001"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007002\n\
7003Searches for the separator sep in S, starting at the end of S, and returns\n\
7004the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007005separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007006
7007static PyObject*
7008unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7009{
7010 return PyUnicode_RPartition((PyObject *)self, separator);
7011}
7012
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007013PyObject *PyUnicode_RSplit(PyObject *s,
7014 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007015 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007016{
7017 PyObject *result;
7018
7019 s = PyUnicode_FromObject(s);
7020 if (s == NULL)
7021 return NULL;
7022 if (sep != NULL) {
7023 sep = PyUnicode_FromObject(sep);
7024 if (sep == NULL) {
7025 Py_DECREF(s);
7026 return NULL;
7027 }
7028 }
7029
7030 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7031
7032 Py_DECREF(s);
7033 Py_XDECREF(sep);
7034 return result;
7035}
7036
7037PyDoc_STRVAR(rsplit__doc__,
7038"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7039\n\
7040Return a list of the words in S, using sep as the\n\
7041delimiter string, starting at the end of the string and\n\
7042working to the front. If maxsplit is given, at most maxsplit\n\
7043splits are done. If sep is not specified, any whitespace string\n\
7044is a separator.");
7045
7046static PyObject*
7047unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7048{
7049 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007050 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007051
Martin v. Löwis18e16552006-02-15 17:27:45 +00007052 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007053 return NULL;
7054
7055 if (substring == Py_None)
7056 return rsplit(self, NULL, maxcount);
7057 else if (PyUnicode_Check(substring))
7058 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7059 else
7060 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7061}
7062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007063PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007064"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065\n\
7066Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007067Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007068is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069
7070static PyObject*
7071unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7072{
Guido van Rossum86662912000-04-11 15:38:46 +00007073 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074
Guido van Rossum86662912000-04-11 15:38:46 +00007075 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 return NULL;
7077
Guido van Rossum86662912000-04-11 15:38:46 +00007078 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079}
7080
7081static
7082PyObject *unicode_str(PyUnicodeObject *self)
7083{
Fred Drakee4315f52000-05-09 19:53:39 +00007084 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085}
7086
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007087PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088"S.swapcase() -> unicode\n\
7089\n\
7090Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007091and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092
7093static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007094unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 return fixup(self, fixswapcase);
7097}
7098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007099PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100"S.translate(table) -> unicode\n\
7101\n\
7102Return a copy of the string S, where all characters have been mapped\n\
7103through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007104Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7105Unmapped characters are left untouched. Characters mapped to None\n\
7106are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107
7108static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007109unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110{
Tim Petersced69f82003-09-16 20:30:58 +00007111 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007113 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114 "ignore");
7115}
7116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007117PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118"S.upper() -> unicode\n\
7119\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007120Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121
7122static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007123unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 return fixup(self, fixupper);
7126}
7127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007128PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129"S.zfill(width) -> unicode\n\
7130\n\
7131Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007132of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133
7134static PyObject *
7135unicode_zfill(PyUnicodeObject *self, PyObject *args)
7136{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007137 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 PyUnicodeObject *u;
7139
Martin v. Löwis18e16552006-02-15 17:27:45 +00007140 Py_ssize_t width;
7141 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 return NULL;
7143
7144 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007145 if (PyUnicode_CheckExact(self)) {
7146 Py_INCREF(self);
7147 return (PyObject*) self;
7148 }
7149 else
7150 return PyUnicode_FromUnicode(
7151 PyUnicode_AS_UNICODE(self),
7152 PyUnicode_GET_SIZE(self)
7153 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 }
7155
7156 fill = width - self->length;
7157
7158 u = pad(self, fill, 0, '0');
7159
Walter Dörwald068325e2002-04-15 13:36:47 +00007160 if (u == NULL)
7161 return NULL;
7162
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163 if (u->str[fill] == '+' || u->str[fill] == '-') {
7164 /* move sign to beginning of string */
7165 u->str[0] = u->str[fill];
7166 u->str[fill] = '0';
7167 }
7168
7169 return (PyObject*) u;
7170}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171
#if 0
/* Compiled out: returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
7179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007180PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007181"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007183Return True if S starts with the specified prefix, False otherwise.\n\
7184With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007185With optional end, stop comparing S at that position.\n\
7186prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187
7188static PyObject *
7189unicode_startswith(PyUnicodeObject *self,
7190 PyObject *args)
7191{
Georg Brandl24250812006-06-09 18:45:48 +00007192 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007194 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007195 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007196 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197
Georg Brandl24250812006-06-09 18:45:48 +00007198 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007199 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007201 if (PyTuple_Check(subobj)) {
7202 Py_ssize_t i;
7203 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7204 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7205 PyTuple_GET_ITEM(subobj, i));
7206 if (substring == NULL)
7207 return NULL;
7208 result = tailmatch(self, substring, start, end, -1);
7209 Py_DECREF(substring);
7210 if (result) {
7211 Py_RETURN_TRUE;
7212 }
7213 }
7214 /* nothing matched */
7215 Py_RETURN_FALSE;
7216 }
7217 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007219 return NULL;
7220 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007222 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223}
7224
7225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007226PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007227"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007229Return True if S ends with the specified suffix, False otherwise.\n\
7230With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007231With optional end, stop comparing S at that position.\n\
7232suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233
7234static PyObject *
7235unicode_endswith(PyUnicodeObject *self,
7236 PyObject *args)
7237{
Georg Brandl24250812006-06-09 18:45:48 +00007238 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007240 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007241 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007242 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243
Georg Brandl24250812006-06-09 18:45:48 +00007244 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7245 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007247 if (PyTuple_Check(subobj)) {
7248 Py_ssize_t i;
7249 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7250 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7251 PyTuple_GET_ITEM(subobj, i));
7252 if (substring == NULL)
7253 return NULL;
7254 result = tailmatch(self, substring, start, end, +1);
7255 Py_DECREF(substring);
7256 if (result) {
7257 Py_RETURN_TRUE;
7258 }
7259 }
7260 Py_RETURN_FALSE;
7261 }
7262 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
Georg Brandl24250812006-06-09 18:45:48 +00007266 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007268 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269}
7270
7271
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007272
7273static PyObject *
7274unicode_getnewargs(PyUnicodeObject *v)
7275{
7276 return Py_BuildValue("(u#)", v->str, v->length);
7277}
7278
7279
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280static PyMethodDef unicode_methods[] = {
7281
7282 /* Order is according to common usage: often used methods should
7283 appear first, since lookup is done sequentially. */
7284
Georg Brandlecdc0a92006-03-30 12:19:07 +00007285 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007286 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7287 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007288 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007289 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7290 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7291 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7292 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7293 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7294 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7295 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007296 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007297 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7298 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7299 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007300 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007301 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007302/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7303 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7304 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7305 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007306 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007307 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007308 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007309 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007310 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7311 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7312 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7313 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7314 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7315 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7316 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7317 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7318 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7319 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7320 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7321 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7322 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7323 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007324 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007325#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007326 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327#endif
7328
7329#if 0
7330 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007331 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332#endif
7333
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007334 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 {NULL, NULL}
7336};
7337
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007338static PyObject *
7339unicode_mod(PyObject *v, PyObject *w)
7340{
7341 if (!PyUnicode_Check(v)) {
7342 Py_INCREF(Py_NotImplemented);
7343 return Py_NotImplemented;
7344 }
7345 return PyUnicode_Format(v, w);
7346}
7347
7348static PyNumberMethods unicode_as_number = {
7349 0, /*nb_add*/
7350 0, /*nb_subtract*/
7351 0, /*nb_multiply*/
7352 0, /*nb_divide*/
7353 unicode_mod, /*nb_remainder*/
7354};
7355
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007357 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00007358 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007359 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7360 (ssizeargfunc) unicode_getitem, /* sq_item */
7361 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362 0, /* sq_ass_item */
7363 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00007364 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365};
7366
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007367static PyObject*
7368unicode_subscript(PyUnicodeObject* self, PyObject* item)
7369{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007370 if (PyIndex_Check(item)) {
7371 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007372 if (i == -1 && PyErr_Occurred())
7373 return NULL;
7374 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007375 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007376 return unicode_getitem(self, i);
7377 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007378 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007379 Py_UNICODE* source_buf;
7380 Py_UNICODE* result_buf;
7381 PyObject* result;
7382
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007383 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007384 &start, &stop, &step, &slicelength) < 0) {
7385 return NULL;
7386 }
7387
7388 if (slicelength <= 0) {
7389 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007390 } else if (start == 0 && step == 1 && slicelength == self->length &&
7391 PyUnicode_CheckExact(self)) {
7392 Py_INCREF(self);
7393 return (PyObject *)self;
7394 } else if (step == 1) {
7395 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007396 } else {
7397 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007398 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7399 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007400
7401 if (result_buf == NULL)
7402 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007403
7404 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7405 result_buf[i] = source_buf[cur];
7406 }
Tim Petersced69f82003-09-16 20:30:58 +00007407
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007408 result = PyUnicode_FromUnicode(result_buf, slicelength);
7409 PyMem_FREE(result_buf);
7410 return result;
7411 }
7412 } else {
7413 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7414 return NULL;
7415 }
7416}
7417
7418static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007419 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007420 (binaryfunc)unicode_subscript, /* mp_subscript */
7421 (objobjargproc)0, /* mp_ass_subscript */
7422};
7423
Martin v. Löwis18e16552006-02-15 17:27:45 +00007424static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007426 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427 const void **ptr)
7428{
7429 if (index != 0) {
7430 PyErr_SetString(PyExc_SystemError,
7431 "accessing non-existent unicode segment");
7432 return -1;
7433 }
7434 *ptr = (void *) self->str;
7435 return PyUnicode_GET_DATA_SIZE(self);
7436}
7437
Martin v. Löwis18e16552006-02-15 17:27:45 +00007438static Py_ssize_t
7439unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 const void **ptr)
7441{
7442 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007443 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 return -1;
7445}
7446
7447static int
7448unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007449 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450{
7451 if (lenp)
7452 *lenp = PyUnicode_GET_DATA_SIZE(self);
7453 return 1;
7454}
7455
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007456static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007458 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 const void **ptr)
7460{
7461 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007462
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 if (index != 0) {
7464 PyErr_SetString(PyExc_SystemError,
7465 "accessing non-existent unicode segment");
7466 return -1;
7467 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007468 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 if (str == NULL)
7470 return -1;
7471 *ptr = (void *) PyString_AS_STRING(str);
7472 return PyString_GET_SIZE(str);
7473}
7474
7475/* Helpers for PyUnicode_Format() */
7476
7477static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007478getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007480 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481 if (argidx < arglen) {
7482 (*p_argidx)++;
7483 if (arglen < 0)
7484 return args;
7485 else
7486 return PyTuple_GetItem(args, argidx);
7487 }
7488 PyErr_SetString(PyExc_TypeError,
7489 "not enough arguments for format string");
7490 return NULL;
7491}
7492
/* Single-bit format flags, combined with | and tested with & (for example
   `flags & F_ALT` in formatfloat() and formatint()). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
7498
Martin v. Löwis18e16552006-02-15 17:27:45 +00007499static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007500strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007502 register Py_ssize_t i;
7503 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 for (i = len - 1; i >= 0; i--)
7505 buffer[i] = (Py_UNICODE) charbuffer[i];
7506
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507 return len;
7508}
7509
Neal Norwitzfc76d632006-01-10 06:03:13 +00007510static int
7511doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7512{
Tim Peters15231542006-02-16 01:08:01 +00007513 Py_ssize_t result;
7514
Neal Norwitzfc76d632006-01-10 06:03:13 +00007515 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007516 result = strtounicode(buffer, (char *)buffer);
7517 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007518}
7519
7520static int
7521longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7522{
Tim Peters15231542006-02-16 01:08:01 +00007523 Py_ssize_t result;
7524
Neal Norwitzfc76d632006-01-10 06:03:13 +00007525 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007526 result = strtounicode(buffer, (char *)buffer);
7527 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007528}
7529
Guido van Rossum078151d2002-08-11 04:24:12 +00007530/* XXX To save some code duplication, formatfloat/long/int could have been
7531 shared with stringobject.c, converting from 8-bit to Unicode after the
7532 formatting is done. */
7533
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534static int
7535formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007536 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537 int flags,
7538 int prec,
7539 int type,
7540 PyObject *v)
7541{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007542 /* fmt = '%#.' + `prec` + `type`
7543 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 char fmt[20];
7545 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007546
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 x = PyFloat_AsDouble(v);
7548 if (x == -1.0 && PyErr_Occurred())
7549 return -1;
7550 if (prec < 0)
7551 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7553 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007554 /* Worst case length calc to ensure no buffer overrun:
7555
7556 'g' formats:
7557 fmt = %#.<prec>g
7558 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7559 for any double rep.)
7560 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7561
7562 'f' formats:
7563 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7564 len = 1 + 50 + 1 + prec = 52 + prec
7565
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007566 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007567 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007568
7569 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00007570 if (((type == 'g' || type == 'G') &&
7571 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007572 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007573 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007574 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007575 return -1;
7576 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007577 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7578 (flags&F_ALT) ? "#" : "",
7579 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007580 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581}
7582
Tim Peters38fd5b62000-09-21 05:43:11 +00007583static PyObject*
7584formatlong(PyObject *val, int flags, int prec, int type)
7585{
7586 char *buf;
7587 int i, len;
7588 PyObject *str; /* temporary string object. */
7589 PyUnicodeObject *result;
7590
7591 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7592 if (!str)
7593 return NULL;
7594 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007595 if (!result) {
7596 Py_DECREF(str);
7597 return NULL;
7598 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007599 for (i = 0; i < len; i++)
7600 result->str[i] = buf[i];
7601 result->str[len] = 0;
7602 Py_DECREF(str);
7603 return (PyObject*)result;
7604}
7605
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606static int
7607formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007608 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 int flags,
7610 int prec,
7611 int type,
7612 PyObject *v)
7613{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007614 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007615 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7616 * + 1 + 1
7617 * = 24
7618 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007619 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007620 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621 long x;
7622
7623 x = PyInt_AsLong(v);
7624 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007625 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007626 if (x < 0 && type == 'u') {
7627 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007628 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007629 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7630 sign = "-";
7631 else
7632 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007634 prec = 1;
7635
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007636 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7637 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007638 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007639 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007640 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007641 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007642 return -1;
7643 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007644
7645 if ((flags & F_ALT) &&
7646 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007647 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007648 * of issues that cause pain:
7649 * - when 0 is being converted, the C standard leaves off
7650 * the '0x' or '0X', which is inconsistent with other
7651 * %#x/%#X conversions and inconsistent with Python's
7652 * hex() function
7653 * - there are platforms that violate the standard and
7654 * convert 0 with the '0x' or '0X'
7655 * (Metrowerks, Compaq Tru64)
7656 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007657 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007658 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007659 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007660 * We can achieve the desired consistency by inserting our
7661 * own '0x' or '0X' prefix, and substituting %x/%X in place
7662 * of %#x/%#X.
7663 *
7664 * Note that this is the same approach as used in
7665 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007666 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007667 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7668 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007669 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007670 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007671 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7672 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007673 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007674 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007675 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007676 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007677 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007678 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679}
7680
7681static int
7682formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007683 size_t buflen,
7684 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007686 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007687 if (PyUnicode_Check(v)) {
7688 if (PyUnicode_GET_SIZE(v) != 1)
7689 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007693 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007694 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007695 goto onError;
7696 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698
7699 else {
7700 /* Integer input truncated to a character */
7701 long x;
7702 x = PyInt_AsLong(v);
7703 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007704 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007705#ifdef Py_UNICODE_WIDE
7706 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007707 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007708 "%c arg not in range(0x110000) "
7709 "(wide Python build)");
7710 return -1;
7711 }
7712#else
7713 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007714 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007715 "%c arg not in range(0x10000) "
7716 "(narrow Python build)");
7717 return -1;
7718 }
7719#endif
7720 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721 }
7722 buf[1] = '\0';
7723 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007724
7725 onError:
7726 PyErr_SetString(PyExc_TypeError,
7727 "%c requires int or char");
7728 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729}
7730
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE units) of the scratch buffer
   in which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single primary expression wherever it is used (e.g. as the operand of
   sizeof or next to another operator).
*/
#define FORMATBUFLEN ((size_t)120)
7740
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741PyObject *PyUnicode_Format(PyObject *format,
7742 PyObject *args)
7743{
7744 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007745 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 int args_owned = 0;
7747 PyUnicodeObject *result = NULL;
7748 PyObject *dict = NULL;
7749 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007750
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 if (format == NULL || args == NULL) {
7752 PyErr_BadInternalCall();
7753 return NULL;
7754 }
7755 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007756 if (uformat == NULL)
7757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 fmt = PyUnicode_AS_UNICODE(uformat);
7759 fmtcnt = PyUnicode_GET_SIZE(uformat);
7760
7761 reslen = rescnt = fmtcnt + 100;
7762 result = _PyUnicode_New(reslen);
7763 if (result == NULL)
7764 goto onError;
7765 res = PyUnicode_AS_UNICODE(result);
7766
7767 if (PyTuple_Check(args)) {
7768 arglen = PyTuple_Size(args);
7769 argidx = 0;
7770 }
7771 else {
7772 arglen = -1;
7773 argidx = -2;
7774 }
Martin v. Löwis68192102007-07-21 06:55:02 +00007775 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007776 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777 dict = args;
7778
7779 while (--fmtcnt >= 0) {
7780 if (*fmt != '%') {
7781 if (--rescnt < 0) {
7782 rescnt = fmtcnt + 100;
7783 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007784 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7787 --rescnt;
7788 }
7789 *res++ = *fmt++;
7790 }
7791 else {
7792 /* Got a format specifier */
7793 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007794 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 Py_UNICODE c = '\0';
7797 Py_UNICODE fill;
7798 PyObject *v = NULL;
7799 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007800 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007802 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007803 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804
7805 fmt++;
7806 if (*fmt == '(') {
7807 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007808 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809 PyObject *key;
7810 int pcount = 1;
7811
7812 if (dict == NULL) {
7813 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007814 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 goto onError;
7816 }
7817 ++fmt;
7818 --fmtcnt;
7819 keystart = fmt;
7820 /* Skip over balanced parentheses */
7821 while (pcount > 0 && --fmtcnt >= 0) {
7822 if (*fmt == ')')
7823 --pcount;
7824 else if (*fmt == '(')
7825 ++pcount;
7826 fmt++;
7827 }
7828 keylen = fmt - keystart - 1;
7829 if (fmtcnt < 0 || pcount > 0) {
7830 PyErr_SetString(PyExc_ValueError,
7831 "incomplete format key");
7832 goto onError;
7833 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007834#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007835 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 then looked up since Python uses strings to hold
7837 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007838 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 key = PyUnicode_EncodeUTF8(keystart,
7840 keylen,
7841 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007842#else
7843 key = PyUnicode_FromUnicode(keystart, keylen);
7844#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 if (key == NULL)
7846 goto onError;
7847 if (args_owned) {
7848 Py_DECREF(args);
7849 args_owned = 0;
7850 }
7851 args = PyObject_GetItem(dict, key);
7852 Py_DECREF(key);
7853 if (args == NULL) {
7854 goto onError;
7855 }
7856 args_owned = 1;
7857 arglen = -1;
7858 argidx = -2;
7859 }
7860 while (--fmtcnt >= 0) {
7861 switch (c = *fmt++) {
7862 case '-': flags |= F_LJUST; continue;
7863 case '+': flags |= F_SIGN; continue;
7864 case ' ': flags |= F_BLANK; continue;
7865 case '#': flags |= F_ALT; continue;
7866 case '0': flags |= F_ZERO; continue;
7867 }
7868 break;
7869 }
7870 if (c == '*') {
7871 v = getnextarg(args, arglen, &argidx);
7872 if (v == NULL)
7873 goto onError;
7874 if (!PyInt_Check(v)) {
7875 PyErr_SetString(PyExc_TypeError,
7876 "* wants int");
7877 goto onError;
7878 }
7879 width = PyInt_AsLong(v);
7880 if (width < 0) {
7881 flags |= F_LJUST;
7882 width = -width;
7883 }
7884 if (--fmtcnt >= 0)
7885 c = *fmt++;
7886 }
7887 else if (c >= '0' && c <= '9') {
7888 width = c - '0';
7889 while (--fmtcnt >= 0) {
7890 c = *fmt++;
7891 if (c < '0' || c > '9')
7892 break;
7893 if ((width*10) / 10 != width) {
7894 PyErr_SetString(PyExc_ValueError,
7895 "width too big");
7896 goto onError;
7897 }
7898 width = width*10 + (c - '0');
7899 }
7900 }
7901 if (c == '.') {
7902 prec = 0;
7903 if (--fmtcnt >= 0)
7904 c = *fmt++;
7905 if (c == '*') {
7906 v = getnextarg(args, arglen, &argidx);
7907 if (v == NULL)
7908 goto onError;
7909 if (!PyInt_Check(v)) {
7910 PyErr_SetString(PyExc_TypeError,
7911 "* wants int");
7912 goto onError;
7913 }
7914 prec = PyInt_AsLong(v);
7915 if (prec < 0)
7916 prec = 0;
7917 if (--fmtcnt >= 0)
7918 c = *fmt++;
7919 }
7920 else if (c >= '0' && c <= '9') {
7921 prec = c - '0';
7922 while (--fmtcnt >= 0) {
7923 c = Py_CHARMASK(*fmt++);
7924 if (c < '0' || c > '9')
7925 break;
7926 if ((prec*10) / 10 != prec) {
7927 PyErr_SetString(PyExc_ValueError,
7928 "prec too big");
7929 goto onError;
7930 }
7931 prec = prec*10 + (c - '0');
7932 }
7933 }
7934 } /* prec */
7935 if (fmtcnt >= 0) {
7936 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937 if (--fmtcnt >= 0)
7938 c = *fmt++;
7939 }
7940 }
7941 if (fmtcnt < 0) {
7942 PyErr_SetString(PyExc_ValueError,
7943 "incomplete format");
7944 goto onError;
7945 }
7946 if (c != '%') {
7947 v = getnextarg(args, arglen, &argidx);
7948 if (v == NULL)
7949 goto onError;
7950 }
7951 sign = 0;
7952 fill = ' ';
7953 switch (c) {
7954
7955 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007956 pbuf = formatbuf;
7957 /* presume that buffer length is at least 1 */
7958 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 len = 1;
7960 break;
7961
7962 case 's':
7963 case 'r':
7964 if (PyUnicode_Check(v) && c == 's') {
7965 temp = v;
7966 Py_INCREF(temp);
7967 }
7968 else {
7969 PyObject *unicode;
7970 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007971 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 else
7973 temp = PyObject_Repr(v);
7974 if (temp == NULL)
7975 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007976 if (PyUnicode_Check(temp))
7977 /* nothing to do */;
7978 else if (PyString_Check(temp)) {
7979 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007980 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007982 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007984 Py_DECREF(temp);
7985 temp = unicode;
7986 if (temp == NULL)
7987 goto onError;
7988 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007989 else {
7990 Py_DECREF(temp);
7991 PyErr_SetString(PyExc_TypeError,
7992 "%s argument has non-string str()");
7993 goto onError;
7994 }
7995 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007996 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 len = PyUnicode_GET_SIZE(temp);
7998 if (prec >= 0 && len > prec)
7999 len = prec;
8000 break;
8001
8002 case 'i':
8003 case 'd':
8004 case 'u':
8005 case 'o':
8006 case 'x':
8007 case 'X':
8008 if (c == 'i')
8009 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008010 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008011 temp = formatlong(v, flags, prec, c);
8012 if (!temp)
8013 goto onError;
8014 pbuf = PyUnicode_AS_UNICODE(temp);
8015 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008016 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008018 else {
8019 pbuf = formatbuf;
8020 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8021 flags, prec, c, v);
8022 if (len < 0)
8023 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008024 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008025 }
8026 if (flags & F_ZERO)
8027 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 break;
8029
8030 case 'e':
8031 case 'E':
8032 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008033 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034 case 'g':
8035 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008036 if (c == 'F')
8037 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008038 pbuf = formatbuf;
8039 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8040 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 if (len < 0)
8042 goto onError;
8043 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008044 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 fill = '0';
8046 break;
8047
8048 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008049 pbuf = formatbuf;
8050 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 if (len < 0)
8052 goto onError;
8053 break;
8054
8055 default:
8056 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008057 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008058 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008059 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008060 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008061 (Py_ssize_t)(fmt - 1 -
8062 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063 goto onError;
8064 }
8065 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008066 if (*pbuf == '-' || *pbuf == '+') {
8067 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068 len--;
8069 }
8070 else if (flags & F_SIGN)
8071 sign = '+';
8072 else if (flags & F_BLANK)
8073 sign = ' ';
8074 else
8075 sign = 0;
8076 }
8077 if (width < len)
8078 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008079 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 reslen -= rescnt;
8081 rescnt = width + fmtcnt + 100;
8082 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008083 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008084 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008085 PyErr_NoMemory();
8086 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008087 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008088 if (_PyUnicode_Resize(&result, reslen) < 0) {
8089 Py_XDECREF(temp);
8090 goto onError;
8091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 res = PyUnicode_AS_UNICODE(result)
8093 + reslen - rescnt;
8094 }
8095 if (sign) {
8096 if (fill != ' ')
8097 *res++ = sign;
8098 rescnt--;
8099 if (width > len)
8100 width--;
8101 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008102 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8103 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008104 assert(pbuf[1] == c);
8105 if (fill != ' ') {
8106 *res++ = *pbuf++;
8107 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008108 }
Tim Petersfff53252001-04-12 18:38:48 +00008109 rescnt -= 2;
8110 width -= 2;
8111 if (width < 0)
8112 width = 0;
8113 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008114 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 if (width > len && !(flags & F_LJUST)) {
8116 do {
8117 --rescnt;
8118 *res++ = fill;
8119 } while (--width > len);
8120 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008121 if (fill == ' ') {
8122 if (sign)
8123 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008124 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008125 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008126 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008127 *res++ = *pbuf++;
8128 *res++ = *pbuf++;
8129 }
8130 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008131 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 res += len;
8133 rescnt -= len;
8134 while (--width >= len) {
8135 --rescnt;
8136 *res++ = ' ';
8137 }
8138 if (dict && (argidx < arglen) && c != '%') {
8139 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008140 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008141 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 goto onError;
8143 }
8144 Py_XDECREF(temp);
8145 } /* '%' */
8146 } /* until end */
8147 if (argidx < arglen && !dict) {
8148 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008149 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 goto onError;
8151 }
8152
Thomas Woutersa96affe2006-03-12 00:29:36 +00008153 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8154 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 if (args_owned) {
8156 Py_DECREF(args);
8157 }
8158 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 return (PyObject *)result;
8160
8161 onError:
8162 Py_XDECREF(result);
8163 Py_DECREF(uformat);
8164 if (args_owned) {
8165 Py_DECREF(args);
8166 }
8167 return NULL;
8168}
8169
/* Buffer-interface slot table for unicode objects; referenced from the
   tp_as_buffer slot of PyUnicode_Type.  The entries are positional and
   are cast to the slot types named in each cast. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* read-buffer slot */
    (writebufferproc) unicode_buffer_getwritebuf,   /* write-buffer slot */
    (segcountproc) unicode_buffer_getsegcount,      /* segment-count slot */
    (charbufferproc) unicode_buffer_getcharbuf,     /* char-buffer slot */
};
8176
Jeremy Hylton938ace62002-07-17 16:30:39 +00008177static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008178unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8179
Tim Peters6d6c1a32001-08-02 04:15:00 +00008180static PyObject *
8181unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8182{
8183 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008184 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008185 char *encoding = NULL;
8186 char *errors = NULL;
8187
Guido van Rossume023fe02001-08-30 03:12:59 +00008188 if (type != &PyUnicode_Type)
8189 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008190 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8191 kwlist, &x, &encoding, &errors))
8192 return NULL;
8193 if (x == NULL)
8194 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008195 if (encoding == NULL && errors == NULL)
8196 return PyObject_Unicode(x);
8197 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008198 return PyUnicode_FromEncodedObject(x, encoding, errors);
8199}
8200
Guido van Rossume023fe02001-08-30 03:12:59 +00008201static PyObject *
8202unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8203{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008204 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008205 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008206
8207 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8208 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8209 if (tmp == NULL)
8210 return NULL;
8211 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008212 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008213 if (pnew == NULL) {
8214 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008215 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008216 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008217 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8218 if (pnew->str == NULL) {
8219 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008220 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008221 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008222 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008223 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008224 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8225 pnew->length = n;
8226 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008227 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008228 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008229}
8230
/* Docstring of the unicode type; installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008237
/* Type object for unicode.  Slots are filled positionally; each entry is
   annotated with the slot it occupies.  Slots left as 0 are unset here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
8281
8282/* Initialize the Unicode implementation */
8283
Thomas Wouters78890102000-07-22 19:25:51 +00008284void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008286 int i;
8287
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008288 /* XXX - move this array to unicodectype.c ? */
8289 Py_UNICODE linebreak[] = {
8290 0x000A, /* LINE FEED */
8291 0x000D, /* CARRIAGE RETURN */
8292 0x001C, /* FILE SEPARATOR */
8293 0x001D, /* GROUP SEPARATOR */
8294 0x001E, /* RECORD SEPARATOR */
8295 0x0085, /* NEXT LINE */
8296 0x2028, /* LINE SEPARATOR */
8297 0x2029, /* PARAGRAPH SEPARATOR */
8298 };
8299
Fred Drakee4315f52000-05-09 19:53:39 +00008300 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008301 unicode_freelist = NULL;
8302 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008304 if (!unicode_empty)
8305 return;
8306
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008307 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008308 for (i = 0; i < 256; i++)
8309 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008310 if (PyType_Ready(&PyUnicode_Type) < 0)
8311 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008312
8313 /* initialize the linebreak bloom filter */
8314 bloom_linebreak = make_bloom_mask(
8315 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8316 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008317
8318 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319}
8320
8321/* Finalize the Unicode implementation */
8322
8323void
Thomas Wouters78890102000-07-22 19:25:51 +00008324_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008326 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008327 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008329 Py_XDECREF(unicode_empty);
8330 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008331
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008332 for (i = 0; i < 256; i++) {
8333 if (unicode_latin1[i]) {
8334 Py_DECREF(unicode_latin1[i]);
8335 unicode_latin1[i] = NULL;
8336 }
8337 }
8338
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008339 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 PyUnicodeObject *v = u;
8341 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008342 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008343 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008344 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008345 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008347 unicode_freelist = NULL;
8348 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008350
Anthony Baxterac6bd462006-04-13 02:06:09 +00008351#ifdef __cplusplus
8352}
8353#endif
8354
8355
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008356/*
8357Local variables:
8358c-basic-offset: 4
8359indent-tabs-mode: nil
8360End:
8361*/