blob: ab638350ee44f3e53694db0884a240ce05894c85 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size threshold for the free list "keep-alive" optimization.

   Unicode objects parked on the free list keep their character buffer
   allocated as long as their length is below this limit.  This saves
   malloc() calls for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least 5
   bits from each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long: shifting a plain int 1 by
   31 is undefined behavior and, where long is wider than int, would
   typically sign-extend into the upper half of the mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: cheap bloom check first, exact check only on a hit. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundh95e2a912006-05-26 11:38:15 +0000144Py_LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundh95e2a912006-05-26 11:38:15 +0000158Py_LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Membership test: cheap bloom check first, exact search only on a
   hit.  The expansion is fully parenthesized so the macro can be used
   safely under '!' or next to '||'. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
199 safe to look at str[length] (without makeing any assumptions about what
200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Tim Petersced69f82003-09-16 20:30:58 +0000236 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* Internal API for use in unicodeobject.c only !
   Casts the address of a PyUnicodeObject* variable to the PyObject**
   expected by PyUnicode_Resize().  Both arguments are parenthesized so
   arbitrary expressions can be passed safely. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Per-character UTF-7 class for the 128 ASCII code points, indexed by
   character value.  It tells whether a character is special, i.e. cannot
   be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must be base64-encoded: anything outside ASCII, class 1
   characters, and optionally whitespace (encodeWS) and Set O (encodeO). */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to the corresponding base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  isalnum() is only defined
   for arguments representable as unsigned char (or EOF) and may accept
   non-ASCII letters in some locales, so everything outside the 7-bit
   range is rejected before it reaches isalnum(). */
#define B64CHAR(c) \
    (((c) & ~0x7f) == 0 && (isalnum(c) || (c) == '+' || (c) == '/'))

/* Inverse of B64(): value of a base64 digit.  Only meaningful for
   characters accepted by B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through out for as long as at least six bits are
   pending in ch; bits is reduced by six for every digit written. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units through out for as long as at least sixteen bits are
   pending in ch.  Relies on an `errmsg` variable and a `utf7Error` label
   in the expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Sequence length implied by each possible UTF-8 lead byte, indexed by
   byte value.  Zero means illegal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
/* unicodeescape_string() (defined after the findchar() helper below)
   returns a Unicode-Escape string version of the given Py_UNICODE buffer.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2017
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002018Py_LOCAL(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019 Py_ssize_t size,
2020 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
2043 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2044 if (repr == NULL)
2045 return NULL;
2046
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002047 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048
2049 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002051 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 !findchar(s, size, '"')) ? '"' : '\'';
2053 }
2054 while (size-- > 0) {
2055 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002056
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002057 /* Escape quotes and backslashes */
2058 if ((quotes &&
2059 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 *p++ = '\\';
2061 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002062 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002064
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002065#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002066 /* Map 21-bit characters to '\U00xxxxxx' */
2067 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002068 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002069
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002070 /* Resize the string if necessary */
2071 if (offset + 12 > PyString_GET_SIZE(repr)) {
2072 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002073 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074 p = PyString_AS_STRING(repr) + offset;
2075 }
2076
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002077 *p++ = '\\';
2078 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002079 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2080 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2081 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2082 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2083 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2084 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2085 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002086 *p++ = hexdigit[ch & 0x0000000F];
2087 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002088 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002089#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002090 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2091 else if (ch >= 0xD800 && ch < 0xDC00) {
2092 Py_UNICODE ch2;
2093 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002094
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 ch2 = *s++;
2096 size--;
2097 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2098 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2099 *p++ = '\\';
2100 *p++ = 'U';
2101 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2102 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2103 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2104 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2105 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2106 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2107 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2108 *p++ = hexdigit[ucs & 0x0000000F];
2109 continue;
2110 }
2111 /* Fall through: isolated surrogates are copied as-is */
2112 s--;
2113 size++;
2114 }
2115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002117 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 *p++ = '\\';
2119 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002120 *p++ = hexdigit[(ch >> 12) & 0x000F];
2121 *p++ = hexdigit[(ch >> 8) & 0x000F];
2122 *p++ = hexdigit[(ch >> 4) & 0x000F];
2123 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002125
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002126 /* Map special whitespace to '\t', \n', '\r' */
2127 else if (ch == '\t') {
2128 *p++ = '\\';
2129 *p++ = 't';
2130 }
2131 else if (ch == '\n') {
2132 *p++ = '\\';
2133 *p++ = 'n';
2134 }
2135 else if (ch == '\r') {
2136 *p++ = '\\';
2137 *p++ = 'r';
2138 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002139
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002140 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002141 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002143 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002144 *p++ = hexdigit[(ch >> 4) & 0x000F];
2145 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002146 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002147
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 /* Copy everything else as-is */
2149 else
2150 *p++ = (char) ch;
2151 }
2152 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002153 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154
2155 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002156 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 return repr;
2158}
2159
2160PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002161 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162{
2163 return unicodeescape_string(s, size, 0);
2164}
2165
2166PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2167{
2168 if (!PyUnicode_Check(unicode)) {
2169 PyErr_BadArgument();
2170 return NULL;
2171 }
2172 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2173 PyUnicode_GET_SIZE(unicode));
2174}
2175
2176/* --- Raw Unicode Escape Codec ------------------------------------------- */
2177
2178PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002179 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 const char *errors)
2181{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002182 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002183 Py_ssize_t startinpos;
2184 Py_ssize_t endinpos;
2185 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 const char *end;
2189 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002190 PyObject *errorHandler = NULL;
2191 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002192
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 /* Escaped strings will always be longer than the resulting
2194 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 length after conversion to the true value. (But decoding error
2196 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 v = _PyUnicode_New(size);
2198 if (v == NULL)
2199 goto onError;
2200 if (size == 0)
2201 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 end = s + size;
2204 while (s < end) {
2205 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002206 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002208 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209
2210 /* Non-escape characters are interpreted as Unicode ordinals */
2211 if (*s != '\\') {
2212 *p++ = (unsigned char)*s++;
2213 continue;
2214 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
2217 /* \u-escapes are only interpreted iff the number of leading
2218 backslashes if odd */
2219 bs = s;
2220 for (;s < end;) {
2221 if (*s != '\\')
2222 break;
2223 *p++ = (unsigned char)*s++;
2224 }
2225 if (((s - bs) & 1) == 0 ||
2226 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 continue;
2229 }
2230 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 s++;
2233
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002234 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002236 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002237 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002239 endinpos = s-starts;
2240 if (unicode_decode_call_errorhandler(
2241 errors, &errorHandler,
2242 "rawunicodeescape", "truncated \\uXXXX",
2243 starts, size, &startinpos, &endinpos, &exc, &s,
2244 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 }
2248 x = (x<<4) & ~0xF;
2249 if (c >= '0' && c <= '9')
2250 x += c - '0';
2251 else if (c >= 'a' && c <= 'f')
2252 x += 10 + c - 'a';
2253 else
2254 x += 10 + c - 'A';
2255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256#ifndef Py_UNICODE_WIDE
2257 if (x > 0x10000) {
2258 if (unicode_decode_call_errorhandler(
2259 errors, &errorHandler,
2260 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2261 starts, size, &startinpos, &endinpos, &exc, &s,
2262 (PyObject **)&v, &outpos, &p))
2263 goto onError;
2264 }
2265#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 *p++ = x;
2267 nextByte:
2268 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002270 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002271 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_XDECREF(errorHandler);
2273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002275
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 onError:
2277 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 Py_XDECREF(errorHandler);
2279 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 return NULL;
2281}
2282
2283PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002284 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285{
2286 PyObject *repr;
2287 char *p;
2288 char *q;
2289
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002290 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002292#ifdef Py_UNICODE_WIDE
2293 repr = PyString_FromStringAndSize(NULL, 10 * size);
2294#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002296#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 if (repr == NULL)
2298 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002299 if (size == 0)
2300 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 p = q = PyString_AS_STRING(repr);
2303 while (size-- > 0) {
2304 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002305#ifdef Py_UNICODE_WIDE
2306 /* Map 32-bit characters to '\Uxxxxxxxx' */
2307 if (ch >= 0x10000) {
2308 *p++ = '\\';
2309 *p++ = 'U';
2310 *p++ = hexdigit[(ch >> 28) & 0xf];
2311 *p++ = hexdigit[(ch >> 24) & 0xf];
2312 *p++ = hexdigit[(ch >> 20) & 0xf];
2313 *p++ = hexdigit[(ch >> 16) & 0xf];
2314 *p++ = hexdigit[(ch >> 12) & 0xf];
2315 *p++ = hexdigit[(ch >> 8) & 0xf];
2316 *p++ = hexdigit[(ch >> 4) & 0xf];
2317 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002318 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002319 else
2320#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 /* Map 16-bit characters to '\uxxxx' */
2322 if (ch >= 256) {
2323 *p++ = '\\';
2324 *p++ = 'u';
2325 *p++ = hexdigit[(ch >> 12) & 0xf];
2326 *p++ = hexdigit[(ch >> 8) & 0xf];
2327 *p++ = hexdigit[(ch >> 4) & 0xf];
2328 *p++ = hexdigit[ch & 15];
2329 }
2330 /* Copy everything else as-is */
2331 else
2332 *p++ = (char) ch;
2333 }
2334 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002335 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336 return repr;
2337}
2338
2339PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2340{
2341 if (!PyUnicode_Check(unicode)) {
2342 PyErr_BadArgument();
2343 return NULL;
2344 }
2345 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2346 PyUnicode_GET_SIZE(unicode));
2347}
2348
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002349/* --- Unicode Internal Codec ------------------------------------------- */
2350
2351PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002352 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002353 const char *errors)
2354{
2355 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002356 Py_ssize_t startinpos;
2357 Py_ssize_t endinpos;
2358 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002359 PyUnicodeObject *v;
2360 Py_UNICODE *p;
2361 const char *end;
2362 const char *reason;
2363 PyObject *errorHandler = NULL;
2364 PyObject *exc = NULL;
2365
Neal Norwitzd43069c2006-01-08 01:12:10 +00002366#ifdef Py_UNICODE_WIDE
2367 Py_UNICODE unimax = PyUnicode_GetMax();
2368#endif
2369
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002370 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2371 if (v == NULL)
2372 goto onError;
2373 if (PyUnicode_GetSize((PyObject *)v) == 0)
2374 return (PyObject *)v;
2375 p = PyUnicode_AS_UNICODE(v);
2376 end = s + size;
2377
2378 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002379 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002380 /* We have to sanity check the raw data, otherwise doom looms for
2381 some malformed UCS-4 data. */
2382 if (
2383 #ifdef Py_UNICODE_WIDE
2384 *p > unimax || *p < 0 ||
2385 #endif
2386 end-s < Py_UNICODE_SIZE
2387 )
2388 {
2389 startinpos = s - starts;
2390 if (end-s < Py_UNICODE_SIZE) {
2391 endinpos = end-starts;
2392 reason = "truncated input";
2393 }
2394 else {
2395 endinpos = s - starts + Py_UNICODE_SIZE;
2396 reason = "illegal code point (> 0x10FFFF)";
2397 }
2398 outpos = p - PyUnicode_AS_UNICODE(v);
2399 if (unicode_decode_call_errorhandler(
2400 errors, &errorHandler,
2401 "unicode_internal", reason,
2402 starts, size, &startinpos, &endinpos, &exc, &s,
2403 (PyObject **)&v, &outpos, &p)) {
2404 goto onError;
2405 }
2406 }
2407 else {
2408 p++;
2409 s += Py_UNICODE_SIZE;
2410 }
2411 }
2412
Martin v. Löwis412fb672006-04-13 06:34:32 +00002413 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002414 goto onError;
2415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
2417 return (PyObject *)v;
2418
2419 onError:
2420 Py_XDECREF(v);
2421 Py_XDECREF(errorHandler);
2422 Py_XDECREF(exc);
2423 return NULL;
2424}
2425
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426/* --- Latin-1 Codec ------------------------------------------------------ */
2427
2428PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002429 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002430 const char *errors)
2431{
2432 PyUnicodeObject *v;
2433 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002436 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002437 Py_UNICODE r = *(unsigned char*)s;
2438 return PyUnicode_FromUnicode(&r, 1);
2439 }
2440
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 v = _PyUnicode_New(size);
2442 if (v == NULL)
2443 goto onError;
2444 if (size == 0)
2445 return (PyObject *)v;
2446 p = PyUnicode_AS_UNICODE(v);
2447 while (size-- > 0)
2448 *p++ = (unsigned char)*s++;
2449 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002450
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451 onError:
2452 Py_XDECREF(v);
2453 return NULL;
2454}
2455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002456/* create or adjust a UnicodeEncodeError */
2457static void make_encode_exception(PyObject **exceptionObject,
2458 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002459 const Py_UNICODE *unicode, Py_ssize_t size,
2460 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002461 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 if (*exceptionObject == NULL) {
2464 *exceptionObject = PyUnicodeEncodeError_Create(
2465 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 }
2467 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2469 goto onError;
2470 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2471 goto onError;
2472 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2473 goto onError;
2474 return;
2475 onError:
2476 Py_DECREF(*exceptionObject);
2477 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 }
2479}
2480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481/* raises a UnicodeEncodeError */
2482static void raise_encode_exception(PyObject **exceptionObject,
2483 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002484 const Py_UNICODE *unicode, Py_ssize_t size,
2485 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 const char *reason)
2487{
2488 make_encode_exception(exceptionObject,
2489 encoding, unicode, size, startpos, endpos, reason);
2490 if (*exceptionObject != NULL)
2491 PyCodec_StrictErrors(*exceptionObject);
2492}
2493
2494/* error handling callback helper:
2495 build arguments, call the callback and check the arguments,
2496 put the result into newpos and return the replacement string, which
2497 has to be freed by the caller */
2498static PyObject *unicode_encode_call_errorhandler(const char *errors,
2499 PyObject **errorHandler,
2500 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002501 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2502 Py_ssize_t startpos, Py_ssize_t endpos,
2503 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002505 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002506
2507 PyObject *restuple;
2508 PyObject *resunicode;
2509
2510 if (*errorHandler == NULL) {
2511 *errorHandler = PyCodec_LookupError(errors);
2512 if (*errorHandler == NULL)
2513 return NULL;
2514 }
2515
2516 make_encode_exception(exceptionObject,
2517 encoding, unicode, size, startpos, endpos, reason);
2518 if (*exceptionObject == NULL)
2519 return NULL;
2520
2521 restuple = PyObject_CallFunctionObjArgs(
2522 *errorHandler, *exceptionObject, NULL);
2523 if (restuple == NULL)
2524 return NULL;
2525 if (!PyTuple_Check(restuple)) {
2526 PyErr_Format(PyExc_TypeError, &argparse[4]);
2527 Py_DECREF(restuple);
2528 return NULL;
2529 }
2530 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2531 &resunicode, newpos)) {
2532 Py_DECREF(restuple);
2533 return NULL;
2534 }
2535 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002536 *newpos = size+*newpos;
2537 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002538 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002539 Py_DECREF(restuple);
2540 return NULL;
2541 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002542 Py_INCREF(resunicode);
2543 Py_DECREF(restuple);
2544 return resunicode;
2545}
2546
2547static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002548 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 const char *errors,
2550 int limit)
2551{
2552 /* output object */
2553 PyObject *res;
2554 /* pointers to the beginning and end+1 of input */
2555 const Py_UNICODE *startp = p;
2556 const Py_UNICODE *endp = p + size;
2557 /* pointer to the beginning of the unencodable characters */
2558 /* const Py_UNICODE *badp = NULL; */
2559 /* pointer into the output */
2560 char *str;
2561 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002562 Py_ssize_t respos = 0;
2563 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002564 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2565 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 PyObject *errorHandler = NULL;
2567 PyObject *exc = NULL;
2568 /* the following variable is used for caching string comparisons
2569 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2570 int known_errorHandler = -1;
2571
2572 /* allocate enough for a simple encoding without
2573 replacements, if we need more, we'll resize */
2574 res = PyString_FromStringAndSize(NULL, size);
2575 if (res == NULL)
2576 goto onError;
2577 if (size == 0)
2578 return res;
2579 str = PyString_AS_STRING(res);
2580 ressize = size;
2581
2582 while (p<endp) {
2583 Py_UNICODE c = *p;
2584
2585 /* can we encode this? */
2586 if (c<limit) {
2587 /* no overflow check, because we know that the space is enough */
2588 *str++ = (char)c;
2589 ++p;
2590 }
2591 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002592 Py_ssize_t unicodepos = p-startp;
2593 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002594 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002595 Py_ssize_t repsize;
2596 Py_ssize_t newpos;
2597 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002598 Py_UNICODE *uni2;
2599 /* startpos for collecting unencodable chars */
2600 const Py_UNICODE *collstart = p;
2601 const Py_UNICODE *collend = p;
2602 /* find all unecodable characters */
2603 while ((collend < endp) && ((*collend)>=limit))
2604 ++collend;
2605 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2606 if (known_errorHandler==-1) {
2607 if ((errors==NULL) || (!strcmp(errors, "strict")))
2608 known_errorHandler = 1;
2609 else if (!strcmp(errors, "replace"))
2610 known_errorHandler = 2;
2611 else if (!strcmp(errors, "ignore"))
2612 known_errorHandler = 3;
2613 else if (!strcmp(errors, "xmlcharrefreplace"))
2614 known_errorHandler = 4;
2615 else
2616 known_errorHandler = 0;
2617 }
2618 switch (known_errorHandler) {
2619 case 1: /* strict */
2620 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2621 goto onError;
2622 case 2: /* replace */
2623 while (collstart++<collend)
2624 *str++ = '?'; /* fall through */
2625 case 3: /* ignore */
2626 p = collend;
2627 break;
2628 case 4: /* xmlcharrefreplace */
2629 respos = str-PyString_AS_STRING(res);
2630 /* determine replacement size (temporarily (mis)uses p) */
2631 for (p = collstart, repsize = 0; p < collend; ++p) {
2632 if (*p<10)
2633 repsize += 2+1+1;
2634 else if (*p<100)
2635 repsize += 2+2+1;
2636 else if (*p<1000)
2637 repsize += 2+3+1;
2638 else if (*p<10000)
2639 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002640#ifndef Py_UNICODE_WIDE
2641 else
2642 repsize += 2+5+1;
2643#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002644 else if (*p<100000)
2645 repsize += 2+5+1;
2646 else if (*p<1000000)
2647 repsize += 2+6+1;
2648 else
2649 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002650#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 }
2652 requiredsize = respos+repsize+(endp-collend);
2653 if (requiredsize > ressize) {
2654 if (requiredsize<2*ressize)
2655 requiredsize = 2*ressize;
2656 if (_PyString_Resize(&res, requiredsize))
2657 goto onError;
2658 str = PyString_AS_STRING(res) + respos;
2659 ressize = requiredsize;
2660 }
2661 /* generate replacement (temporarily (mis)uses p) */
2662 for (p = collstart; p < collend; ++p) {
2663 str += sprintf(str, "&#%d;", (int)*p);
2664 }
2665 p = collend;
2666 break;
2667 default:
2668 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2669 encoding, reason, startp, size, &exc,
2670 collstart-startp, collend-startp, &newpos);
2671 if (repunicode == NULL)
2672 goto onError;
2673 /* need more space? (at least enough for what we
2674 have+the replacement+the rest of the string, so
2675 we won't have to check space for encodable characters) */
2676 respos = str-PyString_AS_STRING(res);
2677 repsize = PyUnicode_GET_SIZE(repunicode);
2678 requiredsize = respos+repsize+(endp-collend);
2679 if (requiredsize > ressize) {
2680 if (requiredsize<2*ressize)
2681 requiredsize = 2*ressize;
2682 if (_PyString_Resize(&res, requiredsize)) {
2683 Py_DECREF(repunicode);
2684 goto onError;
2685 }
2686 str = PyString_AS_STRING(res) + respos;
2687 ressize = requiredsize;
2688 }
2689 /* check if there is anything unencodable in the replacement
2690 and copy it to the output */
2691 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2692 c = *uni2;
2693 if (c >= limit) {
2694 raise_encode_exception(&exc, encoding, startp, size,
2695 unicodepos, unicodepos+1, reason);
2696 Py_DECREF(repunicode);
2697 goto onError;
2698 }
2699 *str = (char)c;
2700 }
2701 p = startp + newpos;
2702 Py_DECREF(repunicode);
2703 }
2704 }
2705 }
2706 /* Resize if we allocated to much */
2707 respos = str-PyString_AS_STRING(res);
2708 if (respos<ressize)
2709 /* If this falls res will be NULL */
2710 _PyString_Resize(&res, respos);
2711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
2713 return res;
2714
2715 onError:
2716 Py_XDECREF(res);
2717 Py_XDECREF(errorHandler);
2718 Py_XDECREF(exc);
2719 return NULL;
2720}
2721
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002723 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 const char *errors)
2725{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002726 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727}
2728
2729PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2730{
2731 if (!PyUnicode_Check(unicode)) {
2732 PyErr_BadArgument();
2733 return NULL;
2734 }
2735 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2736 PyUnicode_GET_SIZE(unicode),
2737 NULL);
2738}
2739
2740/* --- 7-bit ASCII Codec -------------------------------------------------- */
2741
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002743 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 const char *errors)
2745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 PyUnicodeObject *v;
2748 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002749 Py_ssize_t startinpos;
2750 Py_ssize_t endinpos;
2751 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 const char *e;
2753 PyObject *errorHandler = NULL;
2754 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002755
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002757 if (size == 1 && *(unsigned char*)s < 128) {
2758 Py_UNICODE r = *(unsigned char*)s;
2759 return PyUnicode_FromUnicode(&r, 1);
2760 }
Tim Petersced69f82003-09-16 20:30:58 +00002761
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 v = _PyUnicode_New(size);
2763 if (v == NULL)
2764 goto onError;
2765 if (size == 0)
2766 return (PyObject *)v;
2767 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002768 e = s + size;
2769 while (s < e) {
2770 register unsigned char c = (unsigned char)*s;
2771 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002772 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 ++s;
2774 }
2775 else {
2776 startinpos = s-starts;
2777 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002778 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 if (unicode_decode_call_errorhandler(
2780 errors, &errorHandler,
2781 "ascii", "ordinal not in range(128)",
2782 starts, size, &startinpos, &endinpos, &exc, &s,
2783 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002787 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002788 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002789 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_XDECREF(errorHandler);
2791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002793
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 onError:
2795 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002796 Py_XDECREF(errorHandler);
2797 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 return NULL;
2799}
2800
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002802 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 const char *errors)
2804{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002805 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806}
2807
2808PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2809{
2810 if (!PyUnicode_Check(unicode)) {
2811 PyErr_BadArgument();
2812 return NULL;
2813 }
2814 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2815 PyUnicode_GET_SIZE(unicode),
2816 NULL);
2817}
2818
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002819#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002820
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002821/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002822
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002823PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002824 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002825 const char *errors)
2826{
2827 PyUnicodeObject *v;
2828 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002829 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002830
2831 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002832 assert(size < INT_MAX);
2833 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002834 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002835 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2836
2837 v = _PyUnicode_New(usize);
2838 if (v == NULL)
2839 return NULL;
2840 if (usize == 0)
2841 return (PyObject *)v;
2842 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002843 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002844 Py_DECREF(v);
2845 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2846 }
2847
2848 return (PyObject *)v;
2849}
2850
2851PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002852 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002853 const char *errors)
2854{
2855 PyObject *repr;
2856 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002857 DWORD mbcssize;
2858
2859 /* If there are no characters, bail now! */
2860 if (size==0)
2861 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002862
2863 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002864 assert(size<INT_MAX);
2865 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002866 if (mbcssize==0)
2867 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2868
2869 repr = PyString_FromStringAndSize(NULL, mbcssize);
2870 if (repr == NULL)
2871 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002872 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002873 return repr;
2874
2875 /* Do the conversion */
2876 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002877 assert(size < INT_MAX);
2878 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002879 Py_DECREF(repr);
2880 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2881 }
2882 return repr;
2883}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002884
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002885PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2886{
2887 if (!PyUnicode_Check(unicode)) {
2888 PyErr_BadArgument();
2889 return NULL;
2890 }
2891 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2892 PyUnicode_GET_SIZE(unicode),
2893 NULL);
2894}
2895
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002896#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002897
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898/* --- Character Mapping Codec -------------------------------------------- */
2899
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002901 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 PyObject *mapping,
2903 const char *errors)
2904{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002905 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t startinpos;
2907 Py_ssize_t endinpos;
2908 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002909 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 PyUnicodeObject *v;
2911 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002912 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 PyObject *errorHandler = NULL;
2914 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002915 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002916 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002917
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 /* Default to Latin-1 */
2919 if (mapping == NULL)
2920 return PyUnicode_DecodeLatin1(s, size, errors);
2921
2922 v = _PyUnicode_New(size);
2923 if (v == NULL)
2924 goto onError;
2925 if (size == 0)
2926 return (PyObject *)v;
2927 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002928 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002929 if (PyUnicode_CheckExact(mapping)) {
2930 mapstring = PyUnicode_AS_UNICODE(mapping);
2931 maplen = PyUnicode_GET_SIZE(mapping);
2932 while (s < e) {
2933 unsigned char ch = *s;
2934 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002936 if (ch < maplen)
2937 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002939 if (x == 0xfffe) {
2940 /* undefined mapping */
2941 outpos = p-PyUnicode_AS_UNICODE(v);
2942 startinpos = s-starts;
2943 endinpos = startinpos+1;
2944 if (unicode_decode_call_errorhandler(
2945 errors, &errorHandler,
2946 "charmap", "character maps to <undefined>",
2947 starts, size, &startinpos, &endinpos, &exc, &s,
2948 (PyObject **)&v, &outpos, &p)) {
2949 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002950 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002951 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002952 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002953 *p++ = x;
2954 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002956 }
2957 else {
2958 while (s < e) {
2959 unsigned char ch = *s;
2960 PyObject *w, *x;
2961
2962 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2963 w = PyInt_FromLong((long)ch);
2964 if (w == NULL)
2965 goto onError;
2966 x = PyObject_GetItem(mapping, w);
2967 Py_DECREF(w);
2968 if (x == NULL) {
2969 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2970 /* No mapping found means: mapping is undefined. */
2971 PyErr_Clear();
2972 x = Py_None;
2973 Py_INCREF(x);
2974 } else
2975 goto onError;
2976 }
2977
2978 /* Apply mapping */
2979 if (PyInt_Check(x)) {
2980 long value = PyInt_AS_LONG(x);
2981 if (value < 0 || value > 65535) {
2982 PyErr_SetString(PyExc_TypeError,
2983 "character mapping must be in range(65536)");
2984 Py_DECREF(x);
2985 goto onError;
2986 }
2987 *p++ = (Py_UNICODE)value;
2988 }
2989 else if (x == Py_None) {
2990 /* undefined mapping */
2991 outpos = p-PyUnicode_AS_UNICODE(v);
2992 startinpos = s-starts;
2993 endinpos = startinpos+1;
2994 if (unicode_decode_call_errorhandler(
2995 errors, &errorHandler,
2996 "charmap", "character maps to <undefined>",
2997 starts, size, &startinpos, &endinpos, &exc, &s,
2998 (PyObject **)&v, &outpos, &p)) {
2999 Py_DECREF(x);
3000 goto onError;
3001 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003002 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003003 continue;
3004 }
3005 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003006 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003007
3008 if (targetsize == 1)
3009 /* 1-1 mapping */
3010 *p++ = *PyUnicode_AS_UNICODE(x);
3011
3012 else if (targetsize > 1) {
3013 /* 1-n mapping */
3014 if (targetsize > extrachars) {
3015 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003016 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3017 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003018 (targetsize << 2);
3019 extrachars += needed;
3020 if (_PyUnicode_Resize(&v,
3021 PyUnicode_GET_SIZE(v) + needed) < 0) {
3022 Py_DECREF(x);
3023 goto onError;
3024 }
3025 p = PyUnicode_AS_UNICODE(v) + oldpos;
3026 }
3027 Py_UNICODE_COPY(p,
3028 PyUnicode_AS_UNICODE(x),
3029 targetsize);
3030 p += targetsize;
3031 extrachars -= targetsize;
3032 }
3033 /* 1-0 mapping: skip the character */
3034 }
3035 else {
3036 /* wrong return value */
3037 PyErr_SetString(PyExc_TypeError,
3038 "character mapping must return integer, None or unicode");
3039 Py_DECREF(x);
3040 goto onError;
3041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003043 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 }
3046 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003047 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003049 Py_XDECREF(errorHandler);
3050 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 Py_XDECREF(errorHandler);
3055 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 Py_XDECREF(v);
3057 return NULL;
3058}
3059
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060/* Lookup the character ch in the mapping. If the character
3061 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003062 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003063static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003065 PyObject *w = PyInt_FromLong((long)c);
3066 PyObject *x;
3067
3068 if (w == NULL)
3069 return NULL;
3070 x = PyObject_GetItem(mapping, w);
3071 Py_DECREF(w);
3072 if (x == NULL) {
3073 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3074 /* No mapping found means: mapping is undefined. */
3075 PyErr_Clear();
3076 x = Py_None;
3077 Py_INCREF(x);
3078 return x;
3079 } else
3080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003082 else if (x == Py_None)
3083 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003084 else if (PyInt_Check(x)) {
3085 long value = PyInt_AS_LONG(x);
3086 if (value < 0 || value > 255) {
3087 PyErr_SetString(PyExc_TypeError,
3088 "character mapping must be in range(256)");
3089 Py_DECREF(x);
3090 return NULL;
3091 }
3092 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 else if (PyString_Check(x))
3095 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003097 /* wrong return value */
3098 PyErr_SetString(PyExc_TypeError,
3099 "character mapping must return integer, None or str");
3100 Py_DECREF(x);
3101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
3103}
3104
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003105/* lookup the character, put the result in the output string and adjust
3106 various state variables. Reallocate the output string if not enough
3107 space is available. Return a new reference to the object that
3108 was put in the output buffer, or Py_None, if the mapping was undefined
3109 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003110 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003111static
3112PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003113 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003114{
3115 PyObject *rep = charmapencode_lookup(c, mapping);
3116
3117 if (rep==NULL)
3118 return NULL;
3119 else if (rep==Py_None)
3120 return rep;
3121 else {
3122 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003123 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003124 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003125 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 if (outsize<requiredsize) {
3127 /* exponentially overallocate to minimize reallocations */
3128 if (requiredsize < 2*outsize)
3129 requiredsize = 2*outsize;
3130 if (_PyString_Resize(outobj, requiredsize)) {
3131 Py_DECREF(rep);
3132 return NULL;
3133 }
3134 outstart = PyString_AS_STRING(*outobj);
3135 }
3136 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3137 }
3138 else {
3139 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003140 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3141 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003142 if (outsize<requiredsize) {
3143 /* exponentially overallocate to minimize reallocations */
3144 if (requiredsize < 2*outsize)
3145 requiredsize = 2*outsize;
3146 if (_PyString_Resize(outobj, requiredsize)) {
3147 Py_DECREF(rep);
3148 return NULL;
3149 }
3150 outstart = PyString_AS_STRING(*outobj);
3151 }
3152 memcpy(outstart + *outpos, repchars, repsize);
3153 *outpos += repsize;
3154 }
3155 }
3156 return rep;
3157}
3158
3159/* handle an error in PyUnicode_EncodeCharmap
3160 Return 0 on success, -1 on error */
3161static
3162int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003163 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003165 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003166 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003167{
3168 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003169 Py_ssize_t repsize;
3170 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003171 Py_UNICODE *uni2;
3172 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003173 Py_ssize_t collstartpos = *inpos;
3174 Py_ssize_t collendpos = *inpos+1;
3175 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003176 char *encoding = "charmap";
3177 char *reason = "character maps to <undefined>";
3178
3179 PyObject *x;
3180 /* find all unencodable characters */
3181 while (collendpos < size) {
3182 x = charmapencode_lookup(p[collendpos], mapping);
3183 if (x==NULL)
3184 return -1;
3185 else if (x!=Py_None) {
3186 Py_DECREF(x);
3187 break;
3188 }
3189 Py_DECREF(x);
3190 ++collendpos;
3191 }
3192 /* cache callback name lookup
3193 * (if not done yet, i.e. it's the first error) */
3194 if (*known_errorHandler==-1) {
3195 if ((errors==NULL) || (!strcmp(errors, "strict")))
3196 *known_errorHandler = 1;
3197 else if (!strcmp(errors, "replace"))
3198 *known_errorHandler = 2;
3199 else if (!strcmp(errors, "ignore"))
3200 *known_errorHandler = 3;
3201 else if (!strcmp(errors, "xmlcharrefreplace"))
3202 *known_errorHandler = 4;
3203 else
3204 *known_errorHandler = 0;
3205 }
3206 switch (*known_errorHandler) {
3207 case 1: /* strict */
3208 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3209 return -1;
3210 case 2: /* replace */
3211 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3212 x = charmapencode_output('?', mapping, res, respos);
3213 if (x==NULL) {
3214 return -1;
3215 }
3216 else if (x==Py_None) {
3217 Py_DECREF(x);
3218 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3219 return -1;
3220 }
3221 Py_DECREF(x);
3222 }
3223 /* fall through */
3224 case 3: /* ignore */
3225 *inpos = collendpos;
3226 break;
3227 case 4: /* xmlcharrefreplace */
3228 /* generate replacement (temporarily (mis)uses p) */
3229 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3230 char buffer[2+29+1+1];
3231 char *cp;
3232 sprintf(buffer, "&#%d;", (int)p[collpos]);
3233 for (cp = buffer; *cp; ++cp) {
3234 x = charmapencode_output(*cp, mapping, res, respos);
3235 if (x==NULL)
3236 return -1;
3237 else if (x==Py_None) {
3238 Py_DECREF(x);
3239 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3240 return -1;
3241 }
3242 Py_DECREF(x);
3243 }
3244 }
3245 *inpos = collendpos;
3246 break;
3247 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003248 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 encoding, reason, p, size, exceptionObject,
3250 collstartpos, collendpos, &newpos);
3251 if (repunicode == NULL)
3252 return -1;
3253 /* generate replacement */
3254 repsize = PyUnicode_GET_SIZE(repunicode);
3255 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3256 x = charmapencode_output(*uni2, mapping, res, respos);
3257 if (x==NULL) {
3258 Py_DECREF(repunicode);
3259 return -1;
3260 }
3261 else if (x==Py_None) {
3262 Py_DECREF(repunicode);
3263 Py_DECREF(x);
3264 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3265 return -1;
3266 }
3267 Py_DECREF(x);
3268 }
3269 *inpos = newpos;
3270 Py_DECREF(repunicode);
3271 }
3272 return 0;
3273}
3274
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003276 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 PyObject *mapping,
3278 const char *errors)
3279{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 /* output object */
3281 PyObject *res = NULL;
3282 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003283 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003285 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286 PyObject *errorHandler = NULL;
3287 PyObject *exc = NULL;
3288 /* the following variable is used for caching string comparisons
3289 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3290 * 3=ignore, 4=xmlcharrefreplace */
3291 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292
3293 /* Default to Latin-1 */
3294 if (mapping == NULL)
3295 return PyUnicode_EncodeLatin1(p, size, errors);
3296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 /* allocate enough for a simple encoding without
3298 replacements, if we need more, we'll resize */
3299 res = PyString_FromStringAndSize(NULL, size);
3300 if (res == NULL)
3301 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003302 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003305 while (inpos<size) {
3306 /* try to encode it */
3307 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3308 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310 if (x==Py_None) { /* unencodable character */
3311 if (charmap_encoding_error(p, size, &inpos, mapping,
3312 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003313 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003314 &res, &respos)) {
3315 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003316 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 else
3320 /* done with this character => adjust input position */
3321 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322 Py_DECREF(x);
3323 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003325 /* Resize if we allocated to much */
3326 if (respos<PyString_GET_SIZE(res)) {
3327 if (_PyString_Resize(&res, respos))
3328 goto onError;
3329 }
3330 Py_XDECREF(exc);
3331 Py_XDECREF(errorHandler);
3332 return res;
3333
3334 onError:
3335 Py_XDECREF(res);
3336 Py_XDECREF(exc);
3337 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 return NULL;
3339}
3340
3341PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3342 PyObject *mapping)
3343{
3344 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3345 PyErr_BadArgument();
3346 return NULL;
3347 }
3348 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3349 PyUnicode_GET_SIZE(unicode),
3350 mapping,
3351 NULL);
3352}
3353
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354/* create or adjust a UnicodeTranslateError */
3355static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003356 const Py_UNICODE *unicode, Py_ssize_t size,
3357 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003360 if (*exceptionObject == NULL) {
3361 *exceptionObject = PyUnicodeTranslateError_Create(
3362 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 }
3364 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3366 goto onError;
3367 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3368 goto onError;
3369 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3370 goto onError;
3371 return;
3372 onError:
3373 Py_DECREF(*exceptionObject);
3374 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375 }
3376}
3377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378/* raises a UnicodeTranslateError */
3379static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003380 const Py_UNICODE *unicode, Py_ssize_t size,
3381 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382 const char *reason)
3383{
3384 make_translate_exception(exceptionObject,
3385 unicode, size, startpos, endpos, reason);
3386 if (*exceptionObject != NULL)
3387 PyCodec_StrictErrors(*exceptionObject);
3388}
3389
3390/* error handling callback helper:
3391 build arguments, call the callback and check the arguments,
3392 put the result into newpos and return the replacement string, which
3393 has to be freed by the caller */
3394static PyObject *unicode_translate_call_errorhandler(const char *errors,
3395 PyObject **errorHandler,
3396 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003397 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3398 Py_ssize_t startpos, Py_ssize_t endpos,
3399 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003401 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402
Martin v. Löwis412fb672006-04-13 06:34:32 +00003403 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003404 PyObject *restuple;
3405 PyObject *resunicode;
3406
3407 if (*errorHandler == NULL) {
3408 *errorHandler = PyCodec_LookupError(errors);
3409 if (*errorHandler == NULL)
3410 return NULL;
3411 }
3412
3413 make_translate_exception(exceptionObject,
3414 unicode, size, startpos, endpos, reason);
3415 if (*exceptionObject == NULL)
3416 return NULL;
3417
3418 restuple = PyObject_CallFunctionObjArgs(
3419 *errorHandler, *exceptionObject, NULL);
3420 if (restuple == NULL)
3421 return NULL;
3422 if (!PyTuple_Check(restuple)) {
3423 PyErr_Format(PyExc_TypeError, &argparse[4]);
3424 Py_DECREF(restuple);
3425 return NULL;
3426 }
3427 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003428 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 Py_DECREF(restuple);
3430 return NULL;
3431 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003432 if (i_newpos<0)
3433 *newpos = size+i_newpos;
3434 else
3435 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003436 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003437 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003438 Py_DECREF(restuple);
3439 return NULL;
3440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003441 Py_INCREF(resunicode);
3442 Py_DECREF(restuple);
3443 return resunicode;
3444}
3445
3446/* Lookup the character ch in the mapping and put the result in result,
3447 which must be decrefed by the caller.
3448 Return 0 on success, -1 on error */
3449static
3450int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3451{
3452 PyObject *w = PyInt_FromLong((long)c);
3453 PyObject *x;
3454
3455 if (w == NULL)
3456 return -1;
3457 x = PyObject_GetItem(mapping, w);
3458 Py_DECREF(w);
3459 if (x == NULL) {
3460 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3461 /* No mapping found means: use 1:1 mapping. */
3462 PyErr_Clear();
3463 *result = NULL;
3464 return 0;
3465 } else
3466 return -1;
3467 }
3468 else if (x == Py_None) {
3469 *result = x;
3470 return 0;
3471 }
3472 else if (PyInt_Check(x)) {
3473 long value = PyInt_AS_LONG(x);
3474 long max = PyUnicode_GetMax();
3475 if (value < 0 || value > max) {
3476 PyErr_Format(PyExc_TypeError,
3477 "character mapping must be in range(0x%lx)", max+1);
3478 Py_DECREF(x);
3479 return -1;
3480 }
3481 *result = x;
3482 return 0;
3483 }
3484 else if (PyUnicode_Check(x)) {
3485 *result = x;
3486 return 0;
3487 }
3488 else {
3489 /* wrong return value */
3490 PyErr_SetString(PyExc_TypeError,
3491 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003492 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 return -1;
3494 }
3495}
3496/* ensure that *outobj is at least requiredsize characters long,
3497if not reallocate and adjust various state variables.
3498Return 0 on success, -1 on error */
3499static
Walter Dörwald4894c302003-10-24 14:25:28 +00003500int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003501 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003503 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003504 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003506 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003508 if (requiredsize < 2 * oldsize)
3509 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003510 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 return -1;
3512 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 }
3514 return 0;
3515}
3516/* lookup the character, put the result in the output string and adjust
3517 various state variables. Return a new reference to the object that
3518 was put in the output buffer in *result, or Py_None, if the mapping was
3519 undefined (in which case no character was written).
3520 The called must decref result.
3521 Return 0 on success, -1 on error. */
3522static
Walter Dörwald4894c302003-10-24 14:25:28 +00003523int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003524 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003525 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526{
Walter Dörwald4894c302003-10-24 14:25:28 +00003527 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 return -1;
3529 if (*res==NULL) {
3530 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003531 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 }
3533 else if (*res==Py_None)
3534 ;
3535 else if (PyInt_Check(*res)) {
3536 /* no overflow check, because we know that the space is enough */
3537 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3538 }
3539 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003540 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 if (repsize==1) {
3542 /* no overflow check, because we know that the space is enough */
3543 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3544 }
3545 else if (repsize!=0) {
3546 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003547 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003548 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003549 repsize - 1;
3550 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 return -1;
3552 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3553 *outp += repsize;
3554 }
3555 }
3556 else
3557 return -1;
3558 return 0;
3559}
3560
3561PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003562 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 PyObject *mapping,
3564 const char *errors)
3565{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 /* output object */
3567 PyObject *res = NULL;
3568 /* pointers to the beginning and end+1 of input */
3569 const Py_UNICODE *startp = p;
3570 const Py_UNICODE *endp = p + size;
3571 /* pointer into the output */
3572 Py_UNICODE *str;
3573 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003574 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 char *reason = "character maps to <undefined>";
3576 PyObject *errorHandler = NULL;
3577 PyObject *exc = NULL;
3578 /* the following variable is used for caching string comparisons
3579 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3580 * 3=ignore, 4=xmlcharrefreplace */
3581 int known_errorHandler = -1;
3582
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 if (mapping == NULL) {
3584 PyErr_BadArgument();
3585 return NULL;
3586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587
3588 /* allocate enough for a simple 1:1 translation without
3589 replacements, if we need more, we'll resize */
3590 res = PyUnicode_FromUnicode(NULL, size);
3591 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003592 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 return res;
3595 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 while (p<endp) {
3598 /* try to encode it */
3599 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003600 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602 goto onError;
3603 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003604 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 if (x!=Py_None) /* it worked => adjust input pointer */
3606 ++p;
3607 else { /* untranslatable character */
3608 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003609 Py_ssize_t repsize;
3610 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 Py_UNICODE *uni2;
3612 /* startpos for collecting untranslatable chars */
3613 const Py_UNICODE *collstart = p;
3614 const Py_UNICODE *collend = p+1;
3615 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 /* find all untranslatable characters */
3618 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003619 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 goto onError;
3621 Py_XDECREF(x);
3622 if (x!=Py_None)
3623 break;
3624 ++collend;
3625 }
3626 /* cache callback name lookup
3627 * (if not done yet, i.e. it's the first error) */
3628 if (known_errorHandler==-1) {
3629 if ((errors==NULL) || (!strcmp(errors, "strict")))
3630 known_errorHandler = 1;
3631 else if (!strcmp(errors, "replace"))
3632 known_errorHandler = 2;
3633 else if (!strcmp(errors, "ignore"))
3634 known_errorHandler = 3;
3635 else if (!strcmp(errors, "xmlcharrefreplace"))
3636 known_errorHandler = 4;
3637 else
3638 known_errorHandler = 0;
3639 }
3640 switch (known_errorHandler) {
3641 case 1: /* strict */
3642 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3643 goto onError;
3644 case 2: /* replace */
3645 /* No need to check for space, this is a 1:1 replacement */
3646 for (coll = collstart; coll<collend; ++coll)
3647 *str++ = '?';
3648 /* fall through */
3649 case 3: /* ignore */
3650 p = collend;
3651 break;
3652 case 4: /* xmlcharrefreplace */
3653 /* generate replacement (temporarily (mis)uses p) */
3654 for (p = collstart; p < collend; ++p) {
3655 char buffer[2+29+1+1];
3656 char *cp;
3657 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003658 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3660 goto onError;
3661 for (cp = buffer; *cp; ++cp)
3662 *str++ = *cp;
3663 }
3664 p = collend;
3665 break;
3666 default:
3667 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3668 reason, startp, size, &exc,
3669 collstart-startp, collend-startp, &newpos);
3670 if (repunicode == NULL)
3671 goto onError;
3672 /* generate replacement */
3673 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003674 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3676 Py_DECREF(repunicode);
3677 goto onError;
3678 }
3679 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3680 *str++ = *uni2;
3681 p = startp + newpos;
3682 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 }
3684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 /* Resize if we allocated to much */
3687 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003688 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003689 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003690 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 }
3692 Py_XDECREF(exc);
3693 Py_XDECREF(errorHandler);
3694 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 onError:
3697 Py_XDECREF(res);
3698 Py_XDECREF(exc);
3699 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 return NULL;
3701}
3702
3703PyObject *PyUnicode_Translate(PyObject *str,
3704 PyObject *mapping,
3705 const char *errors)
3706{
3707 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003708
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 str = PyUnicode_FromObject(str);
3710 if (str == NULL)
3711 goto onError;
3712 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3713 PyUnicode_GET_SIZE(str),
3714 mapping,
3715 errors);
3716 Py_DECREF(str);
3717 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003718
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 onError:
3720 Py_XDECREF(str);
3721 return NULL;
3722}
Tim Petersced69f82003-09-16 20:30:58 +00003723
Guido van Rossum9e896b32000-04-05 20:11:21 +00003724/* --- Decimal Encoder ---------------------------------------------------- */
3725
3726int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003727 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003728 char *output,
3729 const char *errors)
3730{
3731 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 PyObject *errorHandler = NULL;
3733 PyObject *exc = NULL;
3734 const char *encoding = "decimal";
3735 const char *reason = "invalid decimal Unicode string";
3736 /* the following variable is used for caching string comparisons
3737 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3738 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003739
3740 if (output == NULL) {
3741 PyErr_BadArgument();
3742 return -1;
3743 }
3744
3745 p = s;
3746 end = s + length;
3747 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003749 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003751 Py_ssize_t repsize;
3752 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 Py_UNICODE *uni2;
3754 Py_UNICODE *collstart;
3755 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003756
Guido van Rossum9e896b32000-04-05 20:11:21 +00003757 if (Py_UNICODE_ISSPACE(ch)) {
3758 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003760 continue;
3761 }
3762 decimal = Py_UNICODE_TODECIMAL(ch);
3763 if (decimal >= 0) {
3764 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003766 continue;
3767 }
Guido van Rossumba477042000-04-06 18:18:10 +00003768 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003769 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003771 continue;
3772 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 /* All other characters are considered unencodable */
3774 collstart = p;
3775 collend = p+1;
3776 while (collend < end) {
3777 if ((0 < *collend && *collend < 256) ||
3778 !Py_UNICODE_ISSPACE(*collend) ||
3779 Py_UNICODE_TODECIMAL(*collend))
3780 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003781 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 /* cache callback name lookup
3783 * (if not done yet, i.e. it's the first error) */
3784 if (known_errorHandler==-1) {
3785 if ((errors==NULL) || (!strcmp(errors, "strict")))
3786 known_errorHandler = 1;
3787 else if (!strcmp(errors, "replace"))
3788 known_errorHandler = 2;
3789 else if (!strcmp(errors, "ignore"))
3790 known_errorHandler = 3;
3791 else if (!strcmp(errors, "xmlcharrefreplace"))
3792 known_errorHandler = 4;
3793 else
3794 known_errorHandler = 0;
3795 }
3796 switch (known_errorHandler) {
3797 case 1: /* strict */
3798 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3799 goto onError;
3800 case 2: /* replace */
3801 for (p = collstart; p < collend; ++p)
3802 *output++ = '?';
3803 /* fall through */
3804 case 3: /* ignore */
3805 p = collend;
3806 break;
3807 case 4: /* xmlcharrefreplace */
3808 /* generate replacement (temporarily (mis)uses p) */
3809 for (p = collstart; p < collend; ++p)
3810 output += sprintf(output, "&#%d;", (int)*p);
3811 p = collend;
3812 break;
3813 default:
3814 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3815 encoding, reason, s, length, &exc,
3816 collstart-s, collend-s, &newpos);
3817 if (repunicode == NULL)
3818 goto onError;
3819 /* generate replacement */
3820 repsize = PyUnicode_GET_SIZE(repunicode);
3821 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3822 Py_UNICODE ch = *uni2;
3823 if (Py_UNICODE_ISSPACE(ch))
3824 *output++ = ' ';
3825 else {
3826 decimal = Py_UNICODE_TODECIMAL(ch);
3827 if (decimal >= 0)
3828 *output++ = '0' + decimal;
3829 else if (0 < ch && ch < 256)
3830 *output++ = (char)ch;
3831 else {
3832 Py_DECREF(repunicode);
3833 raise_encode_exception(&exc, encoding,
3834 s, length, collstart-s, collend-s, reason);
3835 goto onError;
3836 }
3837 }
3838 }
3839 p = s + newpos;
3840 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003841 }
3842 }
3843 /* 0-terminate the output string */
3844 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003845 Py_XDECREF(exc);
3846 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003847 return 0;
3848
3849 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 Py_XDECREF(exc);
3851 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003852 return -1;
3853}
3854
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855/* --- Helpers ------------------------------------------------------------ */
3856
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003857/* fast search/count implementation, based on a mix between boyer-
3858 moore and horspool, with a few more bells and whistles on the top.
3859 for some more background, see: http://effbot.org/stringlib */
3860
/* note: fastsearch may access s[n], which isn't a problem when using
   Python's ordinary string types, but may cause problems if you're
   using this code in other contexts.  also, the count mode returns -1
   if there cannot possibly be a match in the target string, and 0 if
   it has actually checked for matches, but didn't find any.  callers
   beware! */
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003867
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003868#define FAST_COUNT 0
3869#define FAST_SEARCH 1
3870
Fredrik Lundh95e2a912006-05-26 11:38:15 +00003871Py_LOCAL(Py_ssize_t)
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003872fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003873{
3874 long mask;
Fredrik Lundhf2c0dfd2006-05-26 10:27:17 +00003875 Py_ssize_t skip, count = 0;
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003876 Py_ssize_t i, j, mlast, w;
3877
3878 w = n - m;
3879
3880 if (w < 0)
3881 return -1;
3882
3883 /* look for special cases */
3884 if (m <= 1) {
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00003885 if (m <= 0)
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003886 return -1;
3887 /* use special case for 1-character strings */
3888 if (mode == FAST_COUNT) {
3889 for (i = 0; i < n; i++)
3890 if (s[i] == p[0])
3891 count++;
3892 return count;
3893 } else {
3894 for (i = 0; i < n; i++)
3895 if (s[i] == p[0])
3896 return i;
3897 }
3898 return -1;
3899 }
3900
3901 mlast = m - 1;
3902
3903 /* create compressed boyer-moore delta 1 table */
3904 skip = mlast - 1;
3905 /* process pattern[:-1] */
3906 for (mask = i = 0; i < mlast; i++) {
3907 mask |= (1 << (p[i] & 0x1F));
3908 if (p[i] == p[mlast])
3909 skip = mlast - i - 1;
3910 }
3911 /* process pattern[-1] outside the loop */
3912 mask |= (1 << (p[mlast] & 0x1F));
3913
3914 for (i = 0; i <= w; i++) {
3915 /* note: using mlast in the skip path slows things down on x86 */
3916 if (s[i+m-1] == p[m-1]) {
3917 /* candidate match */
3918 for (j = 0; j < mlast; j++)
3919 if (s[i+j] != p[j])
3920 break;
3921 if (j == mlast) {
3922 /* got a match! */
3923 if (mode != FAST_COUNT)
3924 return i;
3925 count++;
3926 i = i + mlast;
3927 continue;
3928 }
3929 /* miss: check if next character is part of pattern */
3930 if (!(mask & (1 << (s[i+m] & 0x1F))))
3931 i = i + m;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00003932 else
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003933 i = i + skip;
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003934 } else {
3935 /* skip: check if next character is part of pattern */
3936 if (!(mask & (1 << (s[i+m] & 0x1F))))
3937 i = i + m;
3938 }
3939 }
3940
3941 if (mode != FAST_COUNT)
3942 return -1;
3943 return count;
3944}
3945
Fredrik Lundh95e2a912006-05-26 11:38:15 +00003946Py_LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
3947 Py_ssize_t start,
3948 Py_ssize_t end,
3949 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003951 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003953 if (start < 0)
3954 start += self->length;
3955 if (start < 0)
3956 start = 0;
3957 if (end > self->length)
3958 end = self->length;
3959 if (end < 0)
3960 end += self->length;
3961 if (end < 0)
3962 end = 0;
3963
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003964 if (substring->length == 0)
3965 return (end - start + 1);
3966
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003967 count = fastsearch(
3968 PyUnicode_AS_UNICODE(self) + start, end - start,
3969 substring->str, substring->length, FAST_COUNT
3970 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00003971
Fredrik Lundh6471ee42006-05-24 14:28:11 +00003972 if (count < 0)
3973 count = 0; /* no match */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974
3975 return count;
3976}
3977
Martin v. Löwis18e16552006-02-15 17:27:45 +00003978Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980 Py_ssize_t start,
3981 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003983 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003984
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 str = PyUnicode_FromObject(str);
3986 if (str == NULL)
3987 return -1;
3988 substr = PyUnicode_FromObject(substr);
3989 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003990 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 return -1;
3992 }
Tim Petersced69f82003-09-16 20:30:58 +00003993
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 result = count((PyUnicodeObject *)str,
3995 start, end,
3996 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003997
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 Py_DECREF(str);
3999 Py_DECREF(substr);
4000 return result;
4001}
4002
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004003static Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004005 Py_ssize_t start,
4006 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 int direction)
4008{
4009 if (start < 0)
4010 start += self->length;
4011 if (start < 0)
4012 start = 0;
4013
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 if (end > self->length)
4015 end = self->length;
4016 if (end < 0)
4017 end += self->length;
4018 if (end < 0)
4019 end = 0;
4020
Guido van Rossum76afbd92002-08-20 17:29:29 +00004021 if (substring->length == 0)
4022 return (direction > 0) ? start : end;
4023
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004024 if (direction > 0) {
4025 Py_ssize_t pos = fastsearch(
4026 PyUnicode_AS_UNICODE(self) + start, end - start,
4027 substring->str, substring->length, FAST_SEARCH
4028 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00004029 if (pos >= 0)
4030 return pos + start;
4031 } else {
4032 end -= substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033 for (; end >= start; end--)
4034 if (Py_UNICODE_MATCH(self, end, substring))
4035 return end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 return -1;
4038}
4039
Martin v. Löwis18e16552006-02-15 17:27:45 +00004040Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004042 Py_ssize_t start,
4043 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 int direction)
4045{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004046 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004047
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 str = PyUnicode_FromObject(str);
4049 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004050 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 substr = PyUnicode_FromObject(substr);
4052 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00004053 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004054 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 }
Tim Petersced69f82003-09-16 20:30:58 +00004056
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 result = findstring((PyUnicodeObject *)str,
4058 (PyUnicodeObject *)substr,
4059 start, end, direction);
4060 Py_DECREF(str);
4061 Py_DECREF(substr);
4062 return result;
4063}
4064
Tim Petersced69f82003-09-16 20:30:58 +00004065static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066int tailmatch(PyUnicodeObject *self,
4067 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004068 Py_ssize_t start,
4069 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 int direction)
4071{
4072 if (start < 0)
4073 start += self->length;
4074 if (start < 0)
4075 start = 0;
4076
4077 if (substring->length == 0)
4078 return 1;
4079
4080 if (end > self->length)
4081 end = self->length;
4082 if (end < 0)
4083 end += self->length;
4084 if (end < 0)
4085 end = 0;
4086
4087 end -= substring->length;
4088 if (end < start)
4089 return 0;
4090
4091 if (direction > 0) {
4092 if (Py_UNICODE_MATCH(self, end, substring))
4093 return 1;
4094 } else {
4095 if (Py_UNICODE_MATCH(self, start, substring))
4096 return 1;
4097 }
4098
4099 return 0;
4100}
4101
Martin v. Löwis18e16552006-02-15 17:27:45 +00004102Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004104 Py_ssize_t start,
4105 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 int direction)
4107{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004108 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004109
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110 str = PyUnicode_FromObject(str);
4111 if (str == NULL)
4112 return -1;
4113 substr = PyUnicode_FromObject(substr);
4114 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004115 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 return -1;
4117 }
Tim Petersced69f82003-09-16 20:30:58 +00004118
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 result = tailmatch((PyUnicodeObject *)str,
4120 (PyUnicodeObject *)substr,
4121 start, end, direction);
4122 Py_DECREF(str);
4123 Py_DECREF(substr);
4124 return result;
4125}
4126
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127/* Apply fixfct filter to the Unicode object self and return a
4128 reference to the modified object */
4129
Tim Petersced69f82003-09-16 20:30:58 +00004130static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131PyObject *fixup(PyUnicodeObject *self,
4132 int (*fixfct)(PyUnicodeObject *s))
4133{
4134
4135 PyUnicodeObject *u;
4136
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004137 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 if (u == NULL)
4139 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004140
4141 Py_UNICODE_COPY(u->str, self->str, self->length);
4142
Tim Peters7a29bd52001-09-12 03:03:31 +00004143 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 /* fixfct should return TRUE if it modified the buffer. If
4145 FALSE, return a reference to the original buffer instead
4146 (to save space, not time) */
4147 Py_INCREF(self);
4148 Py_DECREF(u);
4149 return (PyObject*) self;
4150 }
4151 return (PyObject*) u;
4152}
4153
Tim Petersced69f82003-09-16 20:30:58 +00004154static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155int fixupper(PyUnicodeObject *self)
4156{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004157 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158 Py_UNICODE *s = self->str;
4159 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004160
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161 while (len-- > 0) {
4162 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004163
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 ch = Py_UNICODE_TOUPPER(*s);
4165 if (ch != *s) {
4166 status = 1;
4167 *s = ch;
4168 }
4169 s++;
4170 }
4171
4172 return status;
4173}
4174
Tim Petersced69f82003-09-16 20:30:58 +00004175static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176int fixlower(PyUnicodeObject *self)
4177{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004178 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 Py_UNICODE *s = self->str;
4180 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004181
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182 while (len-- > 0) {
4183 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004184
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 ch = Py_UNICODE_TOLOWER(*s);
4186 if (ch != *s) {
4187 status = 1;
4188 *s = ch;
4189 }
4190 s++;
4191 }
4192
4193 return status;
4194}
4195
Tim Petersced69f82003-09-16 20:30:58 +00004196static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197int fixswapcase(PyUnicodeObject *self)
4198{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004199 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200 Py_UNICODE *s = self->str;
4201 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004202
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203 while (len-- > 0) {
4204 if (Py_UNICODE_ISUPPER(*s)) {
4205 *s = Py_UNICODE_TOLOWER(*s);
4206 status = 1;
4207 } else if (Py_UNICODE_ISLOWER(*s)) {
4208 *s = Py_UNICODE_TOUPPER(*s);
4209 status = 1;
4210 }
4211 s++;
4212 }
4213
4214 return status;
4215}
4216
Tim Petersced69f82003-09-16 20:30:58 +00004217static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218int fixcapitalize(PyUnicodeObject *self)
4219{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004220 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004221 Py_UNICODE *s = self->str;
4222 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004223
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004224 if (len == 0)
4225 return 0;
4226 if (Py_UNICODE_ISLOWER(*s)) {
4227 *s = Py_UNICODE_TOUPPER(*s);
4228 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004230 s++;
4231 while (--len > 0) {
4232 if (Py_UNICODE_ISUPPER(*s)) {
4233 *s = Py_UNICODE_TOLOWER(*s);
4234 status = 1;
4235 }
4236 s++;
4237 }
4238 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239}
4240
4241static
4242int fixtitle(PyUnicodeObject *self)
4243{
4244 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4245 register Py_UNICODE *e;
4246 int previous_is_cased;
4247
4248 /* Shortcut for single character strings */
4249 if (PyUnicode_GET_SIZE(self) == 1) {
4250 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4251 if (*p != ch) {
4252 *p = ch;
4253 return 1;
4254 }
4255 else
4256 return 0;
4257 }
Tim Petersced69f82003-09-16 20:30:58 +00004258
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 e = p + PyUnicode_GET_SIZE(self);
4260 previous_is_cased = 0;
4261 for (; p < e; p++) {
4262 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004263
Guido van Rossumd57fd912000-03-10 22:53:23 +00004264 if (previous_is_cased)
4265 *p = Py_UNICODE_TOLOWER(ch);
4266 else
4267 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004268
4269 if (Py_UNICODE_ISLOWER(ch) ||
4270 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 Py_UNICODE_ISTITLE(ch))
4272 previous_is_cased = 1;
4273 else
4274 previous_is_cased = 0;
4275 }
4276 return 1;
4277}
4278
Tim Peters8ce9f162004-08-27 01:49:32 +00004279PyObject *
4280PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281{
Tim Peters8ce9f162004-08-27 01:49:32 +00004282 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004283 const Py_UNICODE blank = ' ';
4284 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004285 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004286 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004287 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4288 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004289 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4290 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004291 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004292 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004293 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294
Tim Peters05eba1f2004-08-27 21:32:02 +00004295 fseq = PySequence_Fast(seq, "");
4296 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004297 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004298 }
4299
Tim Peters91879ab2004-08-27 22:35:44 +00004300 /* Grrrr. A codec may be invoked to convert str objects to
4301 * Unicode, and so it's possible to call back into Python code
4302 * during PyUnicode_FromObject(), and so it's possible for a sick
4303 * codec to change the size of fseq (if seq is a list). Therefore
4304 * we have to keep refetching the size -- can't assume seqlen
4305 * is invariant.
4306 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004307 seqlen = PySequence_Fast_GET_SIZE(fseq);
4308 /* If empty sequence, return u"". */
4309 if (seqlen == 0) {
4310 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4311 goto Done;
4312 }
4313 /* If singleton sequence with an exact Unicode, return that. */
4314 if (seqlen == 1) {
4315 item = PySequence_Fast_GET_ITEM(fseq, 0);
4316 if (PyUnicode_CheckExact(item)) {
4317 Py_INCREF(item);
4318 res = (PyUnicodeObject *)item;
4319 goto Done;
4320 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004321 }
4322
Tim Peters05eba1f2004-08-27 21:32:02 +00004323 /* At least two items to join, or one that isn't exact Unicode. */
4324 if (seqlen > 1) {
4325 /* Set up sep and seplen -- they're needed. */
4326 if (separator == NULL) {
4327 sep = &blank;
4328 seplen = 1;
4329 }
4330 else {
4331 internal_separator = PyUnicode_FromObject(separator);
4332 if (internal_separator == NULL)
4333 goto onError;
4334 sep = PyUnicode_AS_UNICODE(internal_separator);
4335 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004336 /* In case PyUnicode_FromObject() mutated seq. */
4337 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004338 }
4339 }
4340
4341 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004342 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004343 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004344 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004345 res_p = PyUnicode_AS_UNICODE(res);
4346 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004347
Tim Peters05eba1f2004-08-27 21:32:02 +00004348 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004349 Py_ssize_t itemlen;
4350 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004351
4352 item = PySequence_Fast_GET_ITEM(fseq, i);
4353 /* Convert item to Unicode. */
4354 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4355 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004356 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004357 " %.80s found",
4358 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004359 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004360 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004361 item = PyUnicode_FromObject(item);
4362 if (item == NULL)
4363 goto onError;
4364 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004365
Tim Peters91879ab2004-08-27 22:35:44 +00004366 /* In case PyUnicode_FromObject() mutated seq. */
4367 seqlen = PySequence_Fast_GET_SIZE(fseq);
4368
Tim Peters8ce9f162004-08-27 01:49:32 +00004369 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004371 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004372 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004373 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004374 if (i < seqlen - 1) {
4375 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004376 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004377 goto Overflow;
4378 }
4379 if (new_res_used > res_alloc) {
4380 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004381 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004382 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004383 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004384 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004385 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004386 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004387 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004389 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004390 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004392
4393 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004394 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004395 res_p += itemlen;
4396 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004397 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004398 res_p += seplen;
4399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004401 res_used = new_res_used;
4402 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004403
Tim Peters05eba1f2004-08-27 21:32:02 +00004404 /* Shrink res to match the used area; this probably can't fail,
4405 * but it's cheap to check.
4406 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004407 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004408 goto onError;
4409
4410 Done:
4411 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004412 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 return (PyObject *)res;
4414
Tim Peters8ce9f162004-08-27 01:49:32 +00004415 Overflow:
4416 PyErr_SetString(PyExc_OverflowError,
4417 "join() is too long for a Python string");
4418 Py_DECREF(item);
4419 /* fall through */
4420
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004422 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004423 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004424 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 return NULL;
4426}
4427
Tim Petersced69f82003-09-16 20:30:58 +00004428static
4429PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004430 Py_ssize_t left,
4431 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 Py_UNICODE fill)
4433{
4434 PyUnicodeObject *u;
4435
4436 if (left < 0)
4437 left = 0;
4438 if (right < 0)
4439 right = 0;
4440
Tim Peters7a29bd52001-09-12 03:03:31 +00004441 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 Py_INCREF(self);
4443 return self;
4444 }
4445
4446 u = _PyUnicode_New(left + self->length + right);
4447 if (u) {
4448 if (left)
4449 Py_UNICODE_FILL(u->str, fill, left);
4450 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4451 if (right)
4452 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4453 }
4454
4455 return u;
4456}
4457
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion relies on three names from the enclosing function: a
   scratch variable `PyObject *str`, the target `list`, and an `onError`
   label that is jumped to when creating or appending the slice fails.
   The temporary reference held in `str` is always released.

   The body is wrapped in do { } while (0) so that the macro expands to
   one statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right) \
    do { \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str) \
            goto onError; \
        if (PyList_Append(list, str)) { \
            Py_DECREF(str); \
            goto onError; \
        } \
        Py_DECREF(str); \
    } while (0)
4468
4469static
4470PyObject *split_whitespace(PyUnicodeObject *self,
4471 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004472 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004474 register Py_ssize_t i;
4475 register Py_ssize_t j;
4476 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 PyObject *str;
4478
4479 for (i = j = 0; i < len; ) {
4480 /* find a token */
4481 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4482 i++;
4483 j = i;
4484 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4485 i++;
4486 if (j < i) {
4487 if (maxcount-- <= 0)
4488 break;
4489 SPLIT_APPEND(self->str, j, i);
4490 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4491 i++;
4492 j = i;
4493 }
4494 }
4495 if (j < len) {
4496 SPLIT_APPEND(self->str, j, len);
4497 }
4498 return list;
4499
4500 onError:
4501 Py_DECREF(list);
4502 return NULL;
4503}
4504
4505PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004506 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004508 register Py_ssize_t i;
4509 register Py_ssize_t j;
4510 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511 PyObject *list;
4512 PyObject *str;
4513 Py_UNICODE *data;
4514
4515 string = PyUnicode_FromObject(string);
4516 if (string == NULL)
4517 return NULL;
4518 data = PyUnicode_AS_UNICODE(string);
4519 len = PyUnicode_GET_SIZE(string);
4520
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 list = PyList_New(0);
4522 if (!list)
4523 goto onError;
4524
4525 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004526 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004527
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004529 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531
4532 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004533 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 if (i < len) {
4535 if (data[i] == '\r' && i + 1 < len &&
4536 data[i+1] == '\n')
4537 i += 2;
4538 else
4539 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004540 if (keepends)
4541 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 }
Guido van Rossum86662912000-04-11 15:38:46 +00004543 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544 j = i;
4545 }
4546 if (j < len) {
4547 SPLIT_APPEND(data, j, len);
4548 }
4549
4550 Py_DECREF(string);
4551 return list;
4552
4553 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004554 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 Py_DECREF(string);
4556 return NULL;
4557}
4558
Tim Petersced69f82003-09-16 20:30:58 +00004559static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560PyObject *split_char(PyUnicodeObject *self,
4561 PyObject *list,
4562 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004565 register Py_ssize_t i;
4566 register Py_ssize_t j;
4567 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568 PyObject *str;
4569
4570 for (i = j = 0; i < len; ) {
4571 if (self->str[i] == ch) {
4572 if (maxcount-- <= 0)
4573 break;
4574 SPLIT_APPEND(self->str, j, i);
4575 i = j = i + 1;
4576 } else
4577 i++;
4578 }
4579 if (j <= len) {
4580 SPLIT_APPEND(self->str, j, len);
4581 }
4582 return list;
4583
4584 onError:
4585 Py_DECREF(list);
4586 return NULL;
4587}
4588
Tim Petersced69f82003-09-16 20:30:58 +00004589static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590PyObject *split_substring(PyUnicodeObject *self,
4591 PyObject *list,
4592 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004593 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004595 register Py_ssize_t i;
4596 register Py_ssize_t j;
4597 Py_ssize_t len = self->length;
4598 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 PyObject *str;
4600
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004601 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602 if (Py_UNICODE_MATCH(self, i, substring)) {
4603 if (maxcount-- <= 0)
4604 break;
4605 SPLIT_APPEND(self->str, j, i);
4606 i = j = i + sublen;
4607 } else
4608 i++;
4609 }
4610 if (j <= len) {
4611 SPLIT_APPEND(self->str, j, len);
4612 }
4613 return list;
4614
4615 onError:
4616 Py_DECREF(list);
4617 return NULL;
4618}
4619
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004620static
4621PyObject *rsplit_whitespace(PyUnicodeObject *self,
4622 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004623 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004624{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004625 register Py_ssize_t i;
4626 register Py_ssize_t j;
4627 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004628 PyObject *str;
4629
4630 for (i = j = len - 1; i >= 0; ) {
4631 /* find a token */
4632 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4633 i--;
4634 j = i;
4635 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4636 i--;
4637 if (j > i) {
4638 if (maxcount-- <= 0)
4639 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004640 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004641 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4642 i--;
4643 j = i;
4644 }
4645 }
4646 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004647 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004648 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004649 if (PyList_Reverse(list) < 0)
4650 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004651 return list;
4652
4653 onError:
4654 Py_DECREF(list);
4655 return NULL;
4656}
4657
4658static
4659PyObject *rsplit_char(PyUnicodeObject *self,
4660 PyObject *list,
4661 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004662 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004663{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004664 register Py_ssize_t i;
4665 register Py_ssize_t j;
4666 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004667 PyObject *str;
4668
4669 for (i = j = len - 1; i >= 0; ) {
4670 if (self->str[i] == ch) {
4671 if (maxcount-- <= 0)
4672 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004673 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004674 j = i = i - 1;
4675 } else
4676 i--;
4677 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004678 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004679 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004680 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004681 if (PyList_Reverse(list) < 0)
4682 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004683 return list;
4684
4685 onError:
4686 Py_DECREF(list);
4687 return NULL;
4688}
4689
4690static
4691PyObject *rsplit_substring(PyUnicodeObject *self,
4692 PyObject *list,
4693 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004694 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004695{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004696 register Py_ssize_t i;
4697 register Py_ssize_t j;
4698 Py_ssize_t len = self->length;
4699 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004700 PyObject *str;
4701
4702 for (i = len - sublen, j = len; i >= 0; ) {
4703 if (Py_UNICODE_MATCH(self, i, substring)) {
4704 if (maxcount-- <= 0)
4705 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004706 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004707 j = i;
4708 i -= sublen;
4709 } else
4710 i--;
4711 }
4712 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004713 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004714 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004715 if (PyList_Reverse(list) < 0)
4716 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004717 return list;
4718
4719 onError:
4720 Py_DECREF(list);
4721 return NULL;
4722}
4723
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724#undef SPLIT_APPEND
4725
4726static
4727PyObject *split(PyUnicodeObject *self,
4728 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004729 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730{
4731 PyObject *list;
4732
4733 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004734 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735
4736 list = PyList_New(0);
4737 if (!list)
4738 return NULL;
4739
4740 if (substring == NULL)
4741 return split_whitespace(self,list,maxcount);
4742
4743 else if (substring->length == 1)
4744 return split_char(self,list,substring->str[0],maxcount);
4745
4746 else if (substring->length == 0) {
4747 Py_DECREF(list);
4748 PyErr_SetString(PyExc_ValueError, "empty separator");
4749 return NULL;
4750 }
4751 else
4752 return split_substring(self,list,substring,maxcount);
4753}
4754
Tim Petersced69f82003-09-16 20:30:58 +00004755static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004756PyObject *rsplit(PyUnicodeObject *self,
4757 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004758 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004759{
4760 PyObject *list;
4761
4762 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004763 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004764
4765 list = PyList_New(0);
4766 if (!list)
4767 return NULL;
4768
4769 if (substring == NULL)
4770 return rsplit_whitespace(self,list,maxcount);
4771
4772 else if (substring->length == 1)
4773 return rsplit_char(self,list,substring->str[0],maxcount);
4774
4775 else if (substring->length == 0) {
4776 Py_DECREF(list);
4777 PyErr_SetString(PyExc_ValueError, "empty separator");
4778 return NULL;
4779 }
4780 else
4781 return rsplit_substring(self,list,substring,maxcount);
4782}
4783
4784static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785PyObject *replace(PyUnicodeObject *self,
4786 PyUnicodeObject *str1,
4787 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789{
4790 PyUnicodeObject *u;
4791
4792 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004793 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
Fredrik Lundh347ee272006-05-24 16:35:18 +00004795 if (str1->length == str2->length) {
4796 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004797 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004798 if (str1->length == 1) {
4799 /* replace characters */
4800 Py_UNICODE u1, u2;
4801 if (!findchar(self->str, self->length, str1->str[0]))
4802 goto nothing;
4803 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4804 if (!u)
4805 return NULL;
4806 Py_UNICODE_COPY(u->str, self->str, self->length);
4807 u1 = str1->str[0];
4808 u2 = str2->str[0];
4809 for (i = 0; i < u->length; i++)
4810 if (u->str[i] == u1) {
4811 if (--maxcount < 0)
4812 break;
4813 u->str[i] = u2;
4814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004816 i = fastsearch(
4817 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00004819 if (i < 0)
4820 goto nothing;
4821 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4822 if (!u)
4823 return NULL;
4824 Py_UNICODE_COPY(u->str, self->str, self->length);
4825 while (i <= self->length - str1->length)
4826 if (Py_UNICODE_MATCH(self, i, str1)) {
4827 if (--maxcount < 0)
4828 break;
4829 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
4830 i += str1->length;
4831 } else
4832 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004835
Martin v. Löwis18e16552006-02-15 17:27:45 +00004836 Py_ssize_t n, i;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004837 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 Py_UNICODE *p;
4839
4840 /* replace strings */
4841 n = count(self, 0, self->length, str1);
4842 if (n > maxcount)
4843 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004844 if (n == 0)
4845 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004846 /* new_size = self->length + n * (str2->length - str1->length)); */
4847 delta = (str2->length - str1->length);
4848 if (delta == 0) {
4849 new_size = self->length;
4850 } else {
4851 product = n * (str2->length - str1->length);
4852 if ((product / (str2->length - str1->length)) != n) {
4853 PyErr_SetString(PyExc_OverflowError,
4854 "replace string is too long");
4855 return NULL;
4856 }
4857 new_size = self->length + product;
4858 if (new_size < 0) {
4859 PyErr_SetString(PyExc_OverflowError,
4860 "replace string is too long");
4861 return NULL;
4862 }
4863 }
4864 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00004865 if (!u)
4866 return NULL;
4867 i = 0;
4868 p = u->str;
4869 if (str1->length > 0) {
4870 while (i <= self->length - str1->length)
4871 if (Py_UNICODE_MATCH(self, i, str1)) {
4872 /* replace string segment */
4873 Py_UNICODE_COPY(p, str2->str, str2->length);
4874 p += str2->length;
4875 i += str1->length;
4876 if (--n <= 0) {
4877 /* copy remaining part */
4878 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4879 break;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004880 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004881 } else
4882 *p++ = self->str[i++];
4883 } else {
4884 while (n > 0) {
4885 Py_UNICODE_COPY(p, str2->str, str2->length);
4886 p += str2->length;
4887 if (--n <= 0)
4888 break;
4889 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004891 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 }
4893 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004895
4896nothing:
4897 /* nothing to replace; return original string (when possible) */
4898 if (PyUnicode_CheckExact(self)) {
4899 Py_INCREF(self);
4900 return (PyObject *) self;
4901 }
4902 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903}
4904
4905/* --- Unicode Object Methods --------------------------------------------- */
4906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004907PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908"S.title() -> unicode\n\
4909\n\
4910Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004911characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912
4913static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004914unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 return fixup(self, fixtitle);
4917}
4918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004919PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920"S.capitalize() -> unicode\n\
4921\n\
4922Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004923have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
4925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004926unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928 return fixup(self, fixcapitalize);
4929}
4930
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the words.
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot with the new object */
    k = 0;
    while (k < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                       fixcapitalize);
        if (result == NULL)
            goto onError;       /* result is NULL: reported to the caller */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, result);
        k++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4968
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004969/* Argument converter. Coerces to a single unicode character */
4970
4971static int
4972convert_uc(PyObject *obj, void *addr)
4973{
4974 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4975 PyObject *uniobj;
4976 Py_UNICODE *unistr;
4977
4978 uniobj = PyUnicode_FromObject(obj);
4979 if (uniobj == NULL) {
4980 PyErr_SetString(PyExc_TypeError,
4981 "The fill character cannot be converted to Unicode");
4982 return 0;
4983 }
4984 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4985 PyErr_SetString(PyExc_TypeError,
4986 "The fill character must be exactly one character long");
4987 Py_DECREF(uniobj);
4988 return 0;
4989 }
4990 unistr = PyUnicode_AS_UNICODE(uniobj);
4991 *fillcharloc = unistr[0];
4992 Py_DECREF(uniobj);
4993 return 1;
4994}
4995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004996PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004997"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004999Return S centered in a Unicode string of length width. Padding is\n\
5000done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001
5002static PyObject *
5003unicode_center(PyUnicodeObject *self, PyObject *args)
5004{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005005 Py_ssize_t marg, left;
5006 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005007 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008
Thomas Woutersde017742006-02-16 19:34:37 +00005009 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010 return NULL;
5011
Tim Peters7a29bd52001-09-12 03:03:31 +00005012 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 Py_INCREF(self);
5014 return (PyObject*) self;
5015 }
5016
5017 marg = width - self->length;
5018 left = marg / 2 + (marg & width & 1);
5019
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005020 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021}
5022
Marc-André Lemburge5034372000-08-08 08:04:29 +00005023#if 0
5024
5025/* This code should go into some future Unicode collation support
5026 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005027 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005028
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005029/* speedy UTF-16 code point order comparison */
5030/* gleaned from: */
5031/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5032
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005033static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005034{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005035 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00005036 0, 0, 0, 0, 0, 0, 0, 0,
5037 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00005038 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005039};
5040
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041static int
5042unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5043{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005044 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005045
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 Py_UNICODE *s1 = str1->str;
5047 Py_UNICODE *s2 = str2->str;
5048
5049 len1 = str1->length;
5050 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005051
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005053 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005054
5055 c1 = *s1++;
5056 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005057
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005058 if (c1 > (1<<11) * 26)
5059 c1 += utf16Fixup[c1>>11];
5060 if (c2 > (1<<11) * 26)
5061 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005062 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005063
5064 if (c1 != c2)
5065 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005066
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005067 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068 }
5069
5070 return (len1 < len2) ? -1 : (len1 != len2);
5071}
5072
Marc-André Lemburge5034372000-08-08 08:04:29 +00005073#else
5074
5075static int
5076unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5077{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005078 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005079
5080 Py_UNICODE *s1 = str1->str;
5081 Py_UNICODE *s2 = str2->str;
5082
5083 len1 = str1->length;
5084 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005085
Marc-André Lemburge5034372000-08-08 08:04:29 +00005086 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005087 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005088
Fredrik Lundh45714e92001-06-26 16:39:36 +00005089 c1 = *s1++;
5090 c2 = *s2++;
5091
5092 if (c1 != c2)
5093 return (c1 < c2) ? -1 : 1;
5094
Marc-André Lemburge5034372000-08-08 08:04:29 +00005095 len1--; len2--;
5096 }
5097
5098 return (len1 < len2) ? -1 : (len1 != len2);
5099}
5100
5101#endif
5102
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103int PyUnicode_Compare(PyObject *left,
5104 PyObject *right)
5105{
5106 PyUnicodeObject *u = NULL, *v = NULL;
5107 int result;
5108
5109 /* Coerce the two arguments */
5110 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5111 if (u == NULL)
5112 goto onError;
5113 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5114 if (v == NULL)
5115 goto onError;
5116
Thomas Wouters7e474022000-07-16 12:04:32 +00005117 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 if (v == u) {
5119 Py_DECREF(u);
5120 Py_DECREF(v);
5121 return 0;
5122 }
5123
5124 result = unicode_compare(u, v);
5125
5126 Py_DECREF(u);
5127 Py_DECREF(v);
5128 return result;
5129
5130onError:
5131 Py_XDECREF(u);
5132 Py_XDECREF(v);
5133 return -1;
5134}
5135
Guido van Rossum403d68b2000-03-13 15:55:09 +00005136int PyUnicode_Contains(PyObject *container,
5137 PyObject *element)
5138{
Fredrik Lundh833bf942006-05-23 10:12:21 +00005139 PyUnicodeObject *u, *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005140 Py_ssize_t size;
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005141 Py_ssize_t pos;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005142
5143 /* Coerce the two arguments */
Fredrik Lundh833bf942006-05-23 10:12:21 +00005144 v = (PyUnicodeObject *) PyUnicode_FromObject(element);
5145 if (!v) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005146 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005147 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005148 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005149 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005150
5151 u = (PyUnicodeObject *) PyUnicode_FromObject(container);
5152 if (!u) {
5153 Py_DECREF(v);
5154 return -1;
5155 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005156
Barry Warsaw817918c2002-08-06 16:58:21 +00005157 size = PyUnicode_GET_SIZE(v);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005158 if (!size) {
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00005159 pos = 0;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005160 goto done;
5161 }
Barry Warsaw817918c2002-08-06 16:58:21 +00005162
Fredrik Lundhd5e0dc52006-05-24 15:11:01 +00005163 pos = fastsearch(
5164 PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u),
5165 PyUnicode_AS_UNICODE(v), size, FAST_SEARCH
5166 );
Guido van Rossum403d68b2000-03-13 15:55:09 +00005167
Fredrik Lundh833bf942006-05-23 10:12:21 +00005168done:
Guido van Rossum403d68b2000-03-13 15:55:09 +00005169 Py_DECREF(u);
5170 Py_DECREF(v);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00005171 return (pos != -1);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005172}
5173
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174/* Concat to string or Unicode object giving a new Unicode object. */
5175
5176PyObject *PyUnicode_Concat(PyObject *left,
5177 PyObject *right)
5178{
5179 PyUnicodeObject *u = NULL, *v = NULL, *w;
5180
5181 /* Coerce the two arguments */
5182 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5183 if (u == NULL)
5184 goto onError;
5185 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5186 if (v == NULL)
5187 goto onError;
5188
5189 /* Shortcuts */
5190 if (v == unicode_empty) {
5191 Py_DECREF(v);
5192 return (PyObject *)u;
5193 }
5194 if (u == unicode_empty) {
5195 Py_DECREF(u);
5196 return (PyObject *)v;
5197 }
5198
5199 /* Concat the two Unicode strings */
5200 w = _PyUnicode_New(u->length + v->length);
5201 if (w == NULL)
5202 goto onError;
5203 Py_UNICODE_COPY(w->str, u->str, u->length);
5204 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5205
5206 Py_DECREF(u);
5207 Py_DECREF(v);
5208 return (PyObject *)w;
5209
5210onError:
5211 Py_XDECREF(u);
5212 Py_XDECREF(v);
5213 return NULL;
5214}
5215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005216PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217"S.count(sub[, start[, end]]) -> int\n\
5218\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005219Return the number of non-overlapping occurrences of substring sub in\n\
5220Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005221interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
5223static PyObject *
5224unicode_count(PyUnicodeObject *self, PyObject *args)
5225{
5226 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005227 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005228 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 PyObject *result;
5230
Guido van Rossumb8872e62000-05-09 14:14:27 +00005231 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5232 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 return NULL;
5234
5235 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5236 (PyObject *)substring);
5237 if (substring == NULL)
5238 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005239
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 if (start < 0)
5241 start += self->length;
5242 if (start < 0)
5243 start = 0;
5244 if (end > self->length)
5245 end = self->length;
5246 if (end < 0)
5247 end += self->length;
5248 if (end < 0)
5249 end = 0;
5250
Andrew Dalkeb552c4d2006-05-25 18:03:25 +00005251 result = PyInt_FromSsize_t(count(self, start, end, substring));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252
5253 Py_DECREF(substring);
5254 return result;
5255}
5256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005257PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005258"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005260Encodes S using the codec registered for encoding. encoding defaults\n\
5261to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005262handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005263a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5264'xmlcharrefreplace' as well as any other name registered with\n\
5265codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266
5267static PyObject *
5268unicode_encode(PyUnicodeObject *self, PyObject *args)
5269{
5270 char *encoding = NULL;
5271 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005272 PyObject *v;
5273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5275 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005276 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005277 if (v == NULL)
5278 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005279 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5280 PyErr_Format(PyExc_TypeError,
5281 "encoder did not return a string/unicode object "
5282 "(type=%.400s)",
5283 v->ob_type->tp_name);
5284 Py_DECREF(v);
5285 return NULL;
5286 }
5287 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005288
5289 onError:
5290 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005291}
5292
5293PyDoc_STRVAR(decode__doc__,
5294"S.decode([encoding[,errors]]) -> string or unicode\n\
5295\n\
5296Decodes S using the codec registered for encoding. encoding defaults\n\
5297to the default encoding. errors may be given to set a different error\n\
5298handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5299a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5300as well as any other name registerd with codecs.register_error that is\n\
5301able to handle UnicodeDecodeErrors.");
5302
5303static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005304unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005305{
5306 char *encoding = NULL;
5307 char *errors = NULL;
5308 PyObject *v;
5309
5310 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5311 return NULL;
5312 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005313 if (v == NULL)
5314 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005315 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5316 PyErr_Format(PyExc_TypeError,
5317 "decoder did not return a string/unicode object "
5318 "(type=%.400s)",
5319 v->ob_type->tp_name);
5320 Py_DECREF(v);
5321 return NULL;
5322 }
5323 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005324
5325 onError:
5326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327}
5328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005329PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330"S.expandtabs([tabsize]) -> unicode\n\
5331\n\
5332Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005333If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334
5335static PyObject*
5336unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5337{
5338 Py_UNICODE *e;
5339 Py_UNICODE *p;
5340 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005341 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 PyUnicodeObject *u;
5343 int tabsize = 8;
5344
5345 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5346 return NULL;
5347
Thomas Wouters7e474022000-07-16 12:04:32 +00005348 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 i = j = 0;
5350 e = self->str + self->length;
5351 for (p = self->str; p < e; p++)
5352 if (*p == '\t') {
5353 if (tabsize > 0)
5354 j += tabsize - (j % tabsize);
5355 }
5356 else {
5357 j++;
5358 if (*p == '\n' || *p == '\r') {
5359 i += j;
5360 j = 0;
5361 }
5362 }
5363
5364 /* Second pass: create output string and fill it */
5365 u = _PyUnicode_New(i + j);
5366 if (!u)
5367 return NULL;
5368
5369 j = 0;
5370 q = u->str;
5371
5372 for (p = self->str; p < e; p++)
5373 if (*p == '\t') {
5374 if (tabsize > 0) {
5375 i = tabsize - (j % tabsize);
5376 j += i;
5377 while (i--)
5378 *q++ = ' ';
5379 }
5380 }
5381 else {
5382 j++;
5383 *q++ = *p;
5384 if (*p == '\n' || *p == '\r')
5385 j = 0;
5386 }
5387
5388 return (PyObject*) u;
5389}
5390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005391PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392"S.find(sub [,start [,end]]) -> int\n\
5393\n\
5394Return the lowest index in S where substring sub is found,\n\
5395such that sub is contained within s[start,end]. Optional\n\
5396arguments start and end are interpreted as in slice notation.\n\
5397\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005398Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399
5400static PyObject *
5401unicode_find(PyUnicodeObject *self, PyObject *args)
5402{
5403 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005404 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005405 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 PyObject *result;
5407
Guido van Rossumb8872e62000-05-09 14:14:27 +00005408 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5409 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 return NULL;
5411 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5412 (PyObject *)substring);
5413 if (substring == NULL)
5414 return NULL;
5415
Martin v. Löwis18e16552006-02-15 17:27:45 +00005416 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417
5418 Py_DECREF(substring);
5419 return result;
5420}
5421
5422static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005423unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424{
5425 if (index < 0 || index >= self->length) {
5426 PyErr_SetString(PyExc_IndexError, "string index out of range");
5427 return NULL;
5428 }
5429
5430 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5431}
5432
5433static long
5434unicode_hash(PyUnicodeObject *self)
5435{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005436 /* Since Unicode objects compare equal to their ASCII string
5437 counterparts, they should use the individual character values
5438 as basis for their hash value. This is needed to assure that
5439 strings and Unicode objects behave in the same way as
5440 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
Martin v. Löwis18e16552006-02-15 17:27:45 +00005442 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005443 register Py_UNICODE *p;
5444 register long x;
5445
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 if (self->hash != -1)
5447 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005448 len = PyUnicode_GET_SIZE(self);
5449 p = PyUnicode_AS_UNICODE(self);
5450 x = *p << 7;
5451 while (--len >= 0)
5452 x = (1000003*x) ^ *p++;
5453 x ^= PyUnicode_GET_SIZE(self);
5454 if (x == -1)
5455 x = -2;
5456 self->hash = x;
5457 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458}
5459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005460PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461"S.index(sub [,start [,end]]) -> int\n\
5462\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005463Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464
5465static PyObject *
5466unicode_index(PyUnicodeObject *self, PyObject *args)
5467{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005468 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005470 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005471 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472
Guido van Rossumb8872e62000-05-09 14:14:27 +00005473 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5474 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005476
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5478 (PyObject *)substring);
5479 if (substring == NULL)
5480 return NULL;
5481
5482 result = findstring(self, substring, start, end, 1);
5483
5484 Py_DECREF(substring);
5485 if (result < 0) {
5486 PyErr_SetString(PyExc_ValueError, "substring not found");
5487 return NULL;
5488 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005489 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490}
5491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005492PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005493"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005495Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005496at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497
5498static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005499unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500{
5501 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5502 register const Py_UNICODE *e;
5503 int cased;
5504
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 /* Shortcut for single character strings */
5506 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005507 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005509 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005510 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005511 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005512
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 e = p + PyUnicode_GET_SIZE(self);
5514 cased = 0;
5515 for (; p < e; p++) {
5516 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005517
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005519 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 else if (!cased && Py_UNICODE_ISLOWER(ch))
5521 cased = 1;
5522 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005523 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524}
5525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005526PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005527"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005529Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005530at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531
5532static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005533unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534{
5535 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5536 register const Py_UNICODE *e;
5537 int cased;
5538
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 /* Shortcut for single character strings */
5540 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005541 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005543 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005544 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005545 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 e = p + PyUnicode_GET_SIZE(self);
5548 cased = 0;
5549 for (; p < e; p++) {
5550 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005551
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005553 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 else if (!cased && Py_UNICODE_ISUPPER(ch))
5555 cased = 1;
5556 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005557 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558}
5559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005560PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005561"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005563Return True if S is a titlecased string and there is at least one\n\
5564character in S, i.e. upper- and titlecase characters may only\n\
5565follow uncased characters and lowercase characters only cased ones.\n\
5566Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
5568static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005569unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570{
5571 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5572 register const Py_UNICODE *e;
5573 int cased, previous_is_cased;
5574
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 /* Shortcut for single character strings */
5576 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005577 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5578 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005580 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005581 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005582 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005583
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 e = p + PyUnicode_GET_SIZE(self);
5585 cased = 0;
5586 previous_is_cased = 0;
5587 for (; p < e; p++) {
5588 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005589
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5591 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005592 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 previous_is_cased = 1;
5594 cased = 1;
5595 }
5596 else if (Py_UNICODE_ISLOWER(ch)) {
5597 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005598 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 previous_is_cased = 1;
5600 cased = 1;
5601 }
5602 else
5603 previous_is_cased = 0;
5604 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005605 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606}
5607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005608PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005609"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005611Return True if all characters in S are whitespace\n\
5612and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
5614static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005615unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616{
5617 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5618 register const Py_UNICODE *e;
5619
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 /* Shortcut for single character strings */
5621 if (PyUnicode_GET_SIZE(self) == 1 &&
5622 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005623 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005625 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005626 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005627 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005628
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 e = p + PyUnicode_GET_SIZE(self);
5630 for (; p < e; p++) {
5631 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005632 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005634 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635}
5636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005637PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005638"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005639\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005640Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005641and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005642
5643static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005644unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005645{
5646 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5647 register const Py_UNICODE *e;
5648
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005649 /* Shortcut for single character strings */
5650 if (PyUnicode_GET_SIZE(self) == 1 &&
5651 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005652 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005653
5654 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005655 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005656 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005657
5658 e = p + PyUnicode_GET_SIZE(self);
5659 for (; p < e; p++) {
5660 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005661 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005662 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005663 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005664}
5665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005666PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005667"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005668\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005669Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005670and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005671
5672static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005673unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005674{
5675 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5676 register const Py_UNICODE *e;
5677
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005678 /* Shortcut for single character strings */
5679 if (PyUnicode_GET_SIZE(self) == 1 &&
5680 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005681 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005682
5683 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005684 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005685 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005686
5687 e = p + PyUnicode_GET_SIZE(self);
5688 for (; p < e; p++) {
5689 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005690 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005691 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005692 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005693}
5694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005695PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005696"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005698Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005699False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
5701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005702unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703{
5704 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5705 register const Py_UNICODE *e;
5706
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 /* Shortcut for single character strings */
5708 if (PyUnicode_GET_SIZE(self) == 1 &&
5709 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005710 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005712 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005713 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005714 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005715
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 e = p + PyUnicode_GET_SIZE(self);
5717 for (; p < e; p++) {
5718 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005719 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005721 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722}
5723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005724PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005725"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005727Return True if all characters in S are digits\n\
5728and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
5730static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005731unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732{
5733 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5734 register const Py_UNICODE *e;
5735
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 /* Shortcut for single character strings */
5737 if (PyUnicode_GET_SIZE(self) == 1 &&
5738 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005739 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005741 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005742 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005743 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005744
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 e = p + PyUnicode_GET_SIZE(self);
5746 for (; p < e; p++) {
5747 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005748 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005750 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751}
5752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005753PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005754"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005756Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005757False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
5759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005760unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761{
5762 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5763 register const Py_UNICODE *e;
5764
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 /* Shortcut for single character strings */
5766 if (PyUnicode_GET_SIZE(self) == 1 &&
5767 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005768 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005770 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005771 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005772 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005773
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 e = p + PyUnicode_GET_SIZE(self);
5775 for (; p < e; p++) {
5776 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005777 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005779 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780}
5781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005782PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783"S.join(sequence) -> unicode\n\
5784\n\
5785Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005786sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787
5788static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005789unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005791 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792}
5793
Martin v. Löwis18e16552006-02-15 17:27:45 +00005794static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795unicode_length(PyUnicodeObject *self)
5796{
5797 return self->length;
5798}
5799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005800PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005801"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802\n\
5803Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005804done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805
5806static PyObject *
5807unicode_ljust(PyUnicodeObject *self, PyObject *args)
5808{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005809 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005810 Py_UNICODE fillchar = ' ';
5811
Martin v. Löwis412fb672006-04-13 06:34:32 +00005812 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 return NULL;
5814
Tim Peters7a29bd52001-09-12 03:03:31 +00005815 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 Py_INCREF(self);
5817 return (PyObject*) self;
5818 }
5819
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005820 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821}
5822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005823PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824"S.lower() -> unicode\n\
5825\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005826Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827
5828static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005829unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 return fixup(self, fixlower);
5832}
5833
/* Strip modes shared by the strip helpers below.  The values double as
   indices into stripformat[], so they must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip mode above.
   Each one accepts a single optional object and names the method
   after the ':'. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the leading "|O:" (3 characters)
   of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
5842
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005843/* externally visible for str.strip(unicode) */
5844PyObject *
5845_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5846{
5847 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005848 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005849 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005850 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5851 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005852
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005853 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5854
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005855 i = 0;
5856 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005857 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5858 i++;
5859 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005860 }
5861
5862 j = len;
5863 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005864 do {
5865 j--;
5866 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5867 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005868 }
5869
5870 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005871 Py_INCREF(self);
5872 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005873 }
5874 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005875 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005876}
5877
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
5879static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005880do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005882 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005883 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005884
5885 i = 0;
5886 if (striptype != RIGHTSTRIP) {
5887 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5888 i++;
5889 }
5890 }
5891
5892 j = len;
5893 if (striptype != LEFTSTRIP) {
5894 do {
5895 j--;
5896 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5897 j++;
5898 }
5899
5900 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5901 Py_INCREF(self);
5902 return (PyObject*)self;
5903 }
5904 else
5905 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906}
5907
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005908
5909static PyObject *
5910do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5911{
5912 PyObject *sep = NULL;
5913
5914 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5915 return NULL;
5916
5917 if (sep != NULL && sep != Py_None) {
5918 if (PyUnicode_Check(sep))
5919 return _PyUnicode_XStrip(self, striptype, sep);
5920 else if (PyString_Check(sep)) {
5921 PyObject *res;
5922 sep = PyUnicode_FromObject(sep);
5923 if (sep==NULL)
5924 return NULL;
5925 res = _PyUnicode_XStrip(self, striptype, sep);
5926 Py_DECREF(sep);
5927 return res;
5928 }
5929 else {
5930 PyErr_Format(PyExc_TypeError,
5931 "%s arg must be None, unicode or str",
5932 STRIPNAME(striptype));
5933 return NULL;
5934 }
5935 }
5936
5937 return do_strip(self, striptype);
5938}
5939
5940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005941PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005942"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005943\n\
5944Return a copy of the string S with leading and trailing\n\
5945whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005946If chars is given and not None, remove characters in chars instead.\n\
5947If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005948
5949static PyObject *
5950unicode_strip(PyUnicodeObject *self, PyObject *args)
5951{
5952 if (PyTuple_GET_SIZE(args) == 0)
5953 return do_strip(self, BOTHSTRIP); /* Common case */
5954 else
5955 return do_argstrip(self, BOTHSTRIP, args);
5956}
5957
5958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005959PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005960"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005961\n\
5962Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005963If chars is given and not None, remove characters in chars instead.\n\
5964If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005965
5966static PyObject *
5967unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5968{
5969 if (PyTuple_GET_SIZE(args) == 0)
5970 return do_strip(self, LEFTSTRIP); /* Common case */
5971 else
5972 return do_argstrip(self, LEFTSTRIP, args);
5973}
5974
5975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005976PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005977"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005978\n\
5979Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005980If chars is given and not None, remove characters in chars instead.\n\
5981If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005982
5983static PyObject *
5984unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5985{
5986 if (PyTuple_GET_SIZE(args) == 0)
5987 return do_strip(self, RIGHTSTRIP); /* Common case */
5988 else
5989 return do_argstrip(self, RIGHTSTRIP, args);
5990}
5991
5992
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005994unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995{
5996 PyUnicodeObject *u;
5997 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005998 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005999 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000
6001 if (len < 0)
6002 len = 0;
6003
Tim Peters7a29bd52001-09-12 03:03:31 +00006004 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 /* no repeat, return original string */
6006 Py_INCREF(str);
6007 return (PyObject*) str;
6008 }
Tim Peters8f422462000-09-09 06:13:41 +00006009
6010 /* ensure # of chars needed doesn't overflow int and # of bytes
6011 * needed doesn't overflow size_t
6012 */
6013 nchars = len * str->length;
6014 if (len && nchars / len != str->length) {
6015 PyErr_SetString(PyExc_OverflowError,
6016 "repeated string is too long");
6017 return NULL;
6018 }
6019 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6020 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6021 PyErr_SetString(PyExc_OverflowError,
6022 "repeated string is too long");
6023 return NULL;
6024 }
6025 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 if (!u)
6027 return NULL;
6028
6029 p = u->str;
6030
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006031 if (str->length == 1 && len > 0) {
6032 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006033 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006034 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006035 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006036 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006037 done = str->length;
6038 }
6039 while (done < nchars) {
6040 int n = (done <= nchars-done) ? done : nchars-done;
6041 Py_UNICODE_COPY(p+done, p, n);
6042 done += n;
6043 }
6044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045
6046 return (PyObject*) u;
6047}
6048
6049PyObject *PyUnicode_Replace(PyObject *obj,
6050 PyObject *subobj,
6051 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006052 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053{
6054 PyObject *self;
6055 PyObject *str1;
6056 PyObject *str2;
6057 PyObject *result;
6058
6059 self = PyUnicode_FromObject(obj);
6060 if (self == NULL)
6061 return NULL;
6062 str1 = PyUnicode_FromObject(subobj);
6063 if (str1 == NULL) {
6064 Py_DECREF(self);
6065 return NULL;
6066 }
6067 str2 = PyUnicode_FromObject(replobj);
6068 if (str2 == NULL) {
6069 Py_DECREF(self);
6070 Py_DECREF(str1);
6071 return NULL;
6072 }
Tim Petersced69f82003-09-16 20:30:58 +00006073 result = replace((PyUnicodeObject *)self,
6074 (PyUnicodeObject *)str1,
6075 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 maxcount);
6077 Py_DECREF(self);
6078 Py_DECREF(str1);
6079 Py_DECREF(str2);
6080 return result;
6081}
6082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006083PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084"S.replace (old, new[, maxsplit]) -> unicode\n\
6085\n\
6086Return a copy of S with all occurrences of substring\n\
6087old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006088given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089
6090static PyObject*
6091unicode_replace(PyUnicodeObject *self, PyObject *args)
6092{
6093 PyUnicodeObject *str1;
6094 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006095 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 PyObject *result;
6097
Martin v. Löwis18e16552006-02-15 17:27:45 +00006098 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 return NULL;
6100 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6101 if (str1 == NULL)
6102 return NULL;
6103 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006104 if (str2 == NULL) {
6105 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108
6109 result = replace(self, str1, str2, maxcount);
6110
6111 Py_DECREF(str1);
6112 Py_DECREF(str2);
6113 return result;
6114}
6115
6116static
6117PyObject *unicode_repr(PyObject *unicode)
6118{
6119 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6120 PyUnicode_GET_SIZE(unicode),
6121 1);
6122}
6123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006124PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125"S.rfind(sub [,start [,end]]) -> int\n\
6126\n\
6127Return the highest index in S where substring sub is found,\n\
6128such that sub is contained within s[start,end]. Optional\n\
6129arguments start and end are interpreted as in slice notation.\n\
6130\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006131Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
6133static PyObject *
6134unicode_rfind(PyUnicodeObject *self, PyObject *args)
6135{
6136 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006137 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006138 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 PyObject *result;
6140
Guido van Rossumb8872e62000-05-09 14:14:27 +00006141 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6142 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 return NULL;
6144 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6145 (PyObject *)substring);
6146 if (substring == NULL)
6147 return NULL;
6148
Martin v. Löwis18e16552006-02-15 17:27:45 +00006149 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
6151 Py_DECREF(substring);
6152 return result;
6153}
6154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006155PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156"S.rindex(sub [,start [,end]]) -> int\n\
6157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006158Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
6160static PyObject *
6161unicode_rindex(PyUnicodeObject *self, PyObject *args)
6162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006163 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006165 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006166 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167
Guido van Rossumb8872e62000-05-09 14:14:27 +00006168 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6169 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 return NULL;
6171 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6172 (PyObject *)substring);
6173 if (substring == NULL)
6174 return NULL;
6175
6176 result = findstring(self, substring, start, end, -1);
6177
6178 Py_DECREF(substring);
6179 if (result < 0) {
6180 PyErr_SetString(PyExc_ValueError, "substring not found");
6181 return NULL;
6182 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006183 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006186PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006187"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188\n\
6189Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006190done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
6192static PyObject *
6193unicode_rjust(PyUnicodeObject *self, PyObject *args)
6194{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006195 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006196 Py_UNICODE fillchar = ' ';
6197
Martin v. Löwis412fb672006-04-13 06:34:32 +00006198 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 return NULL;
6200
Tim Peters7a29bd52001-09-12 03:03:31 +00006201 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 Py_INCREF(self);
6203 return (PyObject*) self;
6204 }
6205
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006206 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207}
6208
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006210unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211{
6212 /* standard clamping */
6213 if (start < 0)
6214 start = 0;
6215 if (end < 0)
6216 end = 0;
6217 if (end > self->length)
6218 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006219 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 /* full slice, return original string */
6221 Py_INCREF(self);
6222 return (PyObject*) self;
6223 }
6224 if (start > end)
6225 start = end;
6226 /* copy slice */
6227 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6228 end - start);
6229}
6230
6231PyObject *PyUnicode_Split(PyObject *s,
6232 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006233 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234{
6235 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006236
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 s = PyUnicode_FromObject(s);
6238 if (s == NULL)
6239 return NULL;
6240 if (sep != NULL) {
6241 sep = PyUnicode_FromObject(sep);
6242 if (sep == NULL) {
6243 Py_DECREF(s);
6244 return NULL;
6245 }
6246 }
6247
6248 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6249
6250 Py_DECREF(s);
6251 Py_XDECREF(sep);
6252 return result;
6253}
6254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006255PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256"S.split([sep [,maxsplit]]) -> list of strings\n\
6257\n\
6258Return a list of the words in S, using sep as the\n\
6259delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006260splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006261any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262
6263static PyObject*
6264unicode_split(PyUnicodeObject *self, PyObject *args)
6265{
6266 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006267 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268
Martin v. Löwis18e16552006-02-15 17:27:45 +00006269 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 return NULL;
6271
6272 if (substring == Py_None)
6273 return split(self, NULL, maxcount);
6274 else if (PyUnicode_Check(substring))
6275 return split(self, (PyUnicodeObject *)substring, maxcount);
6276 else
6277 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6278}
6279
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006280PyObject *
6281PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6282{
6283 PyObject* str_obj;
6284 PyObject* sep_obj;
6285 Py_UNICODE *str, *sep;
6286 Py_ssize_t len, sep_len, pos;
6287 PyObject* out;
6288
6289 str_obj = PyUnicode_FromObject(str_in);
6290 if (!str_obj)
6291 return NULL;
6292 sep_obj = PyUnicode_FromObject(sep_in);
6293 if (!sep_obj)
6294 goto error;
6295
6296 str = PyUnicode_AS_UNICODE(str_obj);
6297 len = PyUnicode_GET_SIZE(str_obj);
6298
6299 sep = PyUnicode_AS_UNICODE(sep_obj);
6300 sep_len = PyUnicode_GET_SIZE(sep_obj);
6301
6302 if (sep_len == 0) {
6303 PyErr_SetString(PyExc_ValueError, "empty separator");
6304 goto error;
6305 }
6306
6307 out = PyTuple_New(3);
6308 if (!out)
6309 goto error;
6310
6311 pos = fastsearch(str, len, sep, sep_len, FAST_SEARCH);
6312 if (pos < 0) {
6313 Py_INCREF(str_obj);
6314 PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
6315 Py_INCREF(unicode_empty);
6316 PyTuple_SET_ITEM(out, 1, (PyObject*) unicode_empty);
6317 Py_INCREF(unicode_empty);
6318 PyTuple_SET_ITEM(out, 2, (PyObject*) unicode_empty);
6319 } else {
6320 PyObject* obj;
6321 PyTuple_SET_ITEM(out, 0, PyUnicode_FromUnicode(str, pos));
6322 Py_INCREF(sep_obj);
6323 PyTuple_SET_ITEM(out, 1, sep_obj);
6324 obj = PyUnicode_FromUnicode(str + sep_len + pos, len - sep_len - pos);
6325 PyTuple_SET_ITEM(out, 2, obj);
6326 if (PyErr_Occurred()) {
6327 Py_DECREF(out);
6328 goto error;
6329 }
6330 }
6331
6332 return out;
6333
6334error:
6335 Py_XDECREF(sep_obj);
6336 Py_DECREF(str_obj);
6337 return NULL;
6338}
6339
6340PyDoc_STRVAR(partition__doc__,
6341"S.partition(sep) -> (head, sep, tail)\n\
6342\n\
6343Searches for the separator sep in S, and returns the part before it,\n\
6344the separator itself, and the part after it. If the separator is not\n\
6345found, returns S and two empty strings.");
6346
6347static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006348unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006349{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006350 return PyUnicode_Partition((PyObject *)self, separator);
6351}
6352
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006353PyObject *PyUnicode_RSplit(PyObject *s,
6354 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006355 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006356{
6357 PyObject *result;
6358
6359 s = PyUnicode_FromObject(s);
6360 if (s == NULL)
6361 return NULL;
6362 if (sep != NULL) {
6363 sep = PyUnicode_FromObject(sep);
6364 if (sep == NULL) {
6365 Py_DECREF(s);
6366 return NULL;
6367 }
6368 }
6369
6370 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6371
6372 Py_DECREF(s);
6373 Py_XDECREF(sep);
6374 return result;
6375}
6376
6377PyDoc_STRVAR(rsplit__doc__,
6378"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6379\n\
6380Return a list of the words in S, using sep as the\n\
6381delimiter string, starting at the end of the string and\n\
6382working to the front. If maxsplit is given, at most maxsplit\n\
6383splits are done. If sep is not specified, any whitespace string\n\
6384is a separator.");
6385
6386static PyObject*
6387unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6388{
6389 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006390 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006391
Martin v. Löwis18e16552006-02-15 17:27:45 +00006392 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006393 return NULL;
6394
6395 if (substring == Py_None)
6396 return rsplit(self, NULL, maxcount);
6397 else if (PyUnicode_Check(substring))
6398 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6399 else
6400 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6401}
6402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006403PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006404"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405\n\
6406Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006407Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006408is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
6410static PyObject*
6411unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6412{
Guido van Rossum86662912000-04-11 15:38:46 +00006413 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
Guido van Rossum86662912000-04-11 15:38:46 +00006415 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 return NULL;
6417
Guido van Rossum86662912000-04-11 15:38:46 +00006418 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419}
6420
6421static
6422PyObject *unicode_str(PyUnicodeObject *self)
6423{
Fred Drakee4315f52000-05-09 19:53:39 +00006424 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425}
6426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006427PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428"S.swapcase() -> unicode\n\
6429\n\
6430Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006431and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
6433static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006434unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 return fixup(self, fixswapcase);
6437}
6438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006439PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440"S.translate(table) -> unicode\n\
6441\n\
6442Return a copy of the string S, where all characters have been mapped\n\
6443through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006444Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6445Unmapped characters are left untouched. Characters mapped to None\n\
6446are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447
6448static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006449unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450{
Tim Petersced69f82003-09-16 20:30:58 +00006451 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006453 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 "ignore");
6455}
6456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006457PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458"S.upper() -> unicode\n\
6459\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006460Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
6462static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006463unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 return fixup(self, fixupper);
6466}
6467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006468PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469"S.zfill(width) -> unicode\n\
6470\n\
6471Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006472of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473
6474static PyObject *
6475unicode_zfill(PyUnicodeObject *self, PyObject *args)
6476{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006477 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 PyUnicodeObject *u;
6479
Martin v. Löwis18e16552006-02-15 17:27:45 +00006480 Py_ssize_t width;
6481 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 return NULL;
6483
6484 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006485 if (PyUnicode_CheckExact(self)) {
6486 Py_INCREF(self);
6487 return (PyObject*) self;
6488 }
6489 else
6490 return PyUnicode_FromUnicode(
6491 PyUnicode_AS_UNICODE(self),
6492 PyUnicode_GET_SIZE(self)
6493 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 }
6495
6496 fill = width - self->length;
6497
6498 u = pad(self, fill, 0, '0');
6499
Walter Dörwald068325e2002-04-15 13:36:47 +00006500 if (u == NULL)
6501 return NULL;
6502
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 if (u->str[fill] == '+' || u->str[fill] == '-') {
6504 /* move sign to beginning of string */
6505 u->str[0] = u->str[fill];
6506 u->str[fill] = '0';
6507 }
6508
6509 return (PyObject*) u;
6510}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
6519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006520PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006521"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006523Return True if S starts with the specified prefix, False otherwise.\n\
6524With optional start, test S beginning at that position.\n\
6525With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526
6527static PyObject *
6528unicode_startswith(PyUnicodeObject *self,
6529 PyObject *args)
6530{
6531 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006532 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006533 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 PyObject *result;
6535
Guido van Rossumb8872e62000-05-09 14:14:27 +00006536 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6537 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 return NULL;
6539 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6540 (PyObject *)substring);
6541 if (substring == NULL)
6542 return NULL;
6543
Guido van Rossum77f6a652002-04-03 22:41:51 +00006544 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545
6546 Py_DECREF(substring);
6547 return result;
6548}
6549
6550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006551PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006552"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006554Return True if S ends with the specified suffix, False otherwise.\n\
6555With optional start, test S beginning at that position.\n\
6556With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557
6558static PyObject *
6559unicode_endswith(PyUnicodeObject *self,
6560 PyObject *args)
6561{
6562 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006563 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006564 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 PyObject *result;
6566
Guido van Rossumb8872e62000-05-09 14:14:27 +00006567 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6568 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 return NULL;
6570 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6571 (PyObject *)substring);
6572 if (substring == NULL)
6573 return NULL;
6574
Guido van Rossum77f6a652002-04-03 22:41:51 +00006575 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576
6577 Py_DECREF(substring);
6578 return result;
6579}
6580
6581
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006582
6583static PyObject *
6584unicode_getnewargs(PyUnicodeObject *v)
6585{
6586 return Py_BuildValue("(u#)", v->str, v->length);
6587}
6588
6589
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590static PyMethodDef unicode_methods[] = {
6591
6592 /* Order is according to common usage: often used methods should
6593 appear first, since lookup is done sequentially. */
6594
Georg Brandlecdc0a92006-03-30 12:19:07 +00006595 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006596 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6597 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006598 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006599 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6600 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6601 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6602 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6603 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6604 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6605 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00006606 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006607 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6608 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6609 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006610 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006611 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006612/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6613 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6614 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6615 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006616 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006617 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006618 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006619 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6620 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6621 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6622 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6623 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6624 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6625 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6626 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6627 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6628 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6629 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6630 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6631 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6632 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006633 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006634#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006635 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636#endif
6637
6638#if 0
6639 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006640 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641#endif
6642
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006643 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 {NULL, NULL}
6645};
6646
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006647static PyObject *
6648unicode_mod(PyObject *v, PyObject *w)
6649{
6650 if (!PyUnicode_Check(v)) {
6651 Py_INCREF(Py_NotImplemented);
6652 return Py_NotImplemented;
6653 }
6654 return PyUnicode_Format(v, w);
6655}
6656
6657static PyNumberMethods unicode_as_number = {
6658 0, /*nb_add*/
6659 0, /*nb_subtract*/
6660 0, /*nb_multiply*/
6661 0, /*nb_divide*/
6662 unicode_mod, /*nb_remainder*/
6663};
6664
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006666 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006667 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006668 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6669 (ssizeargfunc) unicode_getitem, /* sq_item */
6670 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 0, /* sq_ass_item */
6672 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006673 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674};
6675
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006676#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6677
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006678static PyObject*
6679unicode_subscript(PyUnicodeObject* self, PyObject* item)
6680{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006681 PyNumberMethods *nb = item->ob_type->tp_as_number;
6682 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6683 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006684 if (i == -1 && PyErr_Occurred())
6685 return NULL;
6686 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006687 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006688 return unicode_getitem(self, i);
6689 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006690 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006691 Py_UNICODE* source_buf;
6692 Py_UNICODE* result_buf;
6693 PyObject* result;
6694
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006695 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006696 &start, &stop, &step, &slicelength) < 0) {
6697 return NULL;
6698 }
6699
6700 if (slicelength <= 0) {
6701 return PyUnicode_FromUnicode(NULL, 0);
6702 } else {
6703 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006704 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6705 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006706
6707 if (result_buf == NULL)
6708 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006709
6710 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6711 result_buf[i] = source_buf[cur];
6712 }
Tim Petersced69f82003-09-16 20:30:58 +00006713
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006714 result = PyUnicode_FromUnicode(result_buf, slicelength);
6715 PyMem_FREE(result_buf);
6716 return result;
6717 }
6718 } else {
6719 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6720 return NULL;
6721 }
6722}
6723
6724static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006725 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006726 (binaryfunc)unicode_subscript, /* mp_subscript */
6727 (objobjargproc)0, /* mp_ass_subscript */
6728};
6729
Martin v. Löwis18e16552006-02-15 17:27:45 +00006730static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006732 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 const void **ptr)
6734{
6735 if (index != 0) {
6736 PyErr_SetString(PyExc_SystemError,
6737 "accessing non-existent unicode segment");
6738 return -1;
6739 }
6740 *ptr = (void *) self->str;
6741 return PyUnicode_GET_DATA_SIZE(self);
6742}
6743
Martin v. Löwis18e16552006-02-15 17:27:45 +00006744static Py_ssize_t
6745unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 const void **ptr)
6747{
6748 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006749 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return -1;
6751}
6752
6753static int
6754unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006755 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
6757 if (lenp)
6758 *lenp = PyUnicode_GET_DATA_SIZE(self);
6759 return 1;
6760}
6761
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006762static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006764 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 const void **ptr)
6766{
6767 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006768
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 if (index != 0) {
6770 PyErr_SetString(PyExc_SystemError,
6771 "accessing non-existent unicode segment");
6772 return -1;
6773 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006774 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 if (str == NULL)
6776 return -1;
6777 *ptr = (void *) PyString_AS_STRING(str);
6778 return PyString_GET_SIZE(str);
6779}
6780
6781/* Helpers for PyUnicode_Format() */
6782
6783static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006784getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006786 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 if (argidx < arglen) {
6788 (*p_argidx)++;
6789 if (arglen < 0)
6790 return args;
6791 else
6792 return PyTuple_GetItem(args, argidx);
6793 }
6794 PyErr_SetString(PyExc_TypeError,
6795 "not enough arguments for format string");
6796 return NULL;
6797}
6798
/* Single-bit flags, one bit each so they can be OR-ed together. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6804
Martin v. Löwis18e16552006-02-15 17:27:45 +00006805static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006806strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006808 register Py_ssize_t i;
6809 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 for (i = len - 1; i >= 0; i--)
6811 buffer[i] = (Py_UNICODE) charbuffer[i];
6812
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 return len;
6814}
6815
Neal Norwitzfc76d632006-01-10 06:03:13 +00006816static int
6817doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6818{
Tim Peters15231542006-02-16 01:08:01 +00006819 Py_ssize_t result;
6820
Neal Norwitzfc76d632006-01-10 06:03:13 +00006821 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006822 result = strtounicode(buffer, (char *)buffer);
6823 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006824}
6825
6826static int
6827longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6828{
Tim Peters15231542006-02-16 01:08:01 +00006829 Py_ssize_t result;
6830
Neal Norwitzfc76d632006-01-10 06:03:13 +00006831 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006832 result = strtounicode(buffer, (char *)buffer);
6833 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006834}
6835
Guido van Rossum078151d2002-08-11 04:24:12 +00006836/* XXX To save some code duplication, formatfloat/long/int could have been
6837 shared with stringobject.c, converting from 8-bit to Unicode after the
6838 formatting is done. */
6839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840static int
6841formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006842 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 int flags,
6844 int prec,
6845 int type,
6846 PyObject *v)
6847{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006848 /* fmt = '%#.' + `prec` + `type`
6849 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 char fmt[20];
6851 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006852
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 x = PyFloat_AsDouble(v);
6854 if (x == -1.0 && PyErr_Occurred())
6855 return -1;
6856 if (prec < 0)
6857 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6859 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006860 /* Worst case length calc to ensure no buffer overrun:
6861
6862 'g' formats:
6863 fmt = %#.<prec>g
6864 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6865 for any double rep.)
6866 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6867
6868 'f' formats:
6869 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6870 len = 1 + 50 + 1 + prec = 52 + prec
6871
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006872 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006873 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006874
6875 */
6876 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6877 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006878 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006879 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006880 return -1;
6881 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006882 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6883 (flags&F_ALT) ? "#" : "",
6884 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006885 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886}
6887
Tim Peters38fd5b62000-09-21 05:43:11 +00006888static PyObject*
6889formatlong(PyObject *val, int flags, int prec, int type)
6890{
6891 char *buf;
6892 int i, len;
6893 PyObject *str; /* temporary string object. */
6894 PyUnicodeObject *result;
6895
6896 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6897 if (!str)
6898 return NULL;
6899 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006900 if (!result) {
6901 Py_DECREF(str);
6902 return NULL;
6903 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006904 for (i = 0; i < len; i++)
6905 result->str[i] = buf[i];
6906 result->str[len] = 0;
6907 Py_DECREF(str);
6908 return (PyObject*)result;
6909}
6910
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911static int
6912formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006913 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 int flags,
6915 int prec,
6916 int type,
6917 PyObject *v)
6918{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006919 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006920 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6921 * + 1 + 1
6922 * = 24
6923 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006924 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006925 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 long x;
6927
6928 x = PyInt_AsLong(v);
6929 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006930 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006931 if (x < 0 && type == 'u') {
6932 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006933 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006934 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6935 sign = "-";
6936 else
6937 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006939 prec = 1;
6940
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006941 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6942 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006943 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006944 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006945 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006946 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006947 return -1;
6948 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006949
6950 if ((flags & F_ALT) &&
6951 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006952 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006953 * of issues that cause pain:
6954 * - when 0 is being converted, the C standard leaves off
6955 * the '0x' or '0X', which is inconsistent with other
6956 * %#x/%#X conversions and inconsistent with Python's
6957 * hex() function
6958 * - there are platforms that violate the standard and
6959 * convert 0 with the '0x' or '0X'
6960 * (Metrowerks, Compaq Tru64)
6961 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006962 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006963 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006964 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006965 * We can achieve the desired consistency by inserting our
6966 * own '0x' or '0X' prefix, and substituting %x/%X in place
6967 * of %#x/%#X.
6968 *
6969 * Note that this is the same approach as used in
6970 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006971 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006972 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6973 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006974 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006975 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006976 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6977 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006978 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006979 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006980 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006981 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006982 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006983 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984}
6985
6986static int
6987formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006988 size_t buflen,
6989 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006991 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006992 if (PyUnicode_Check(v)) {
6993 if (PyUnicode_GET_SIZE(v) != 1)
6994 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006998 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006999 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007000 goto onError;
7001 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003
7004 else {
7005 /* Integer input truncated to a character */
7006 long x;
7007 x = PyInt_AsLong(v);
7008 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007009 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007010#ifdef Py_UNICODE_WIDE
7011 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007012 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007013 "%c arg not in range(0x110000) "
7014 "(wide Python build)");
7015 return -1;
7016 }
7017#else
7018 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007019 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007020 "%c arg not in range(0x10000) "
7021 "(narrow Python build)");
7022 return -1;
7023 }
7024#endif
7025 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 }
7027 buf[1] = '\0';
7028 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007029
7030 onError:
7031 PyErr_SetString(PyExc_TypeError,
7032 "%c requires int or char");
7033 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034}
7035
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
7045
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046PyObject *PyUnicode_Format(PyObject *format,
7047 PyObject *args)
7048{
7049 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007050 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 int args_owned = 0;
7052 PyUnicodeObject *result = NULL;
7053 PyObject *dict = NULL;
7054 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007055
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 if (format == NULL || args == NULL) {
7057 PyErr_BadInternalCall();
7058 return NULL;
7059 }
7060 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007061 if (uformat == NULL)
7062 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 fmt = PyUnicode_AS_UNICODE(uformat);
7064 fmtcnt = PyUnicode_GET_SIZE(uformat);
7065
7066 reslen = rescnt = fmtcnt + 100;
7067 result = _PyUnicode_New(reslen);
7068 if (result == NULL)
7069 goto onError;
7070 res = PyUnicode_AS_UNICODE(result);
7071
7072 if (PyTuple_Check(args)) {
7073 arglen = PyTuple_Size(args);
7074 argidx = 0;
7075 }
7076 else {
7077 arglen = -1;
7078 argidx = -2;
7079 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007080 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7081 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082 dict = args;
7083
7084 while (--fmtcnt >= 0) {
7085 if (*fmt != '%') {
7086 if (--rescnt < 0) {
7087 rescnt = fmtcnt + 100;
7088 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007089 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007090 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7092 --rescnt;
7093 }
7094 *res++ = *fmt++;
7095 }
7096 else {
7097 /* Got a format specifier */
7098 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007099 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101 Py_UNICODE c = '\0';
7102 Py_UNICODE fill;
7103 PyObject *v = NULL;
7104 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007105 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007107 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007108 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109
7110 fmt++;
7111 if (*fmt == '(') {
7112 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007113 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114 PyObject *key;
7115 int pcount = 1;
7116
7117 if (dict == NULL) {
7118 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007119 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120 goto onError;
7121 }
7122 ++fmt;
7123 --fmtcnt;
7124 keystart = fmt;
7125 /* Skip over balanced parentheses */
7126 while (pcount > 0 && --fmtcnt >= 0) {
7127 if (*fmt == ')')
7128 --pcount;
7129 else if (*fmt == '(')
7130 ++pcount;
7131 fmt++;
7132 }
7133 keylen = fmt - keystart - 1;
7134 if (fmtcnt < 0 || pcount > 0) {
7135 PyErr_SetString(PyExc_ValueError,
7136 "incomplete format key");
7137 goto onError;
7138 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007139#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007140 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 then looked up since Python uses strings to hold
7142 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007143 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 key = PyUnicode_EncodeUTF8(keystart,
7145 keylen,
7146 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007147#else
7148 key = PyUnicode_FromUnicode(keystart, keylen);
7149#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 if (key == NULL)
7151 goto onError;
7152 if (args_owned) {
7153 Py_DECREF(args);
7154 args_owned = 0;
7155 }
7156 args = PyObject_GetItem(dict, key);
7157 Py_DECREF(key);
7158 if (args == NULL) {
7159 goto onError;
7160 }
7161 args_owned = 1;
7162 arglen = -1;
7163 argidx = -2;
7164 }
7165 while (--fmtcnt >= 0) {
7166 switch (c = *fmt++) {
7167 case '-': flags |= F_LJUST; continue;
7168 case '+': flags |= F_SIGN; continue;
7169 case ' ': flags |= F_BLANK; continue;
7170 case '#': flags |= F_ALT; continue;
7171 case '0': flags |= F_ZERO; continue;
7172 }
7173 break;
7174 }
7175 if (c == '*') {
7176 v = getnextarg(args, arglen, &argidx);
7177 if (v == NULL)
7178 goto onError;
7179 if (!PyInt_Check(v)) {
7180 PyErr_SetString(PyExc_TypeError,
7181 "* wants int");
7182 goto onError;
7183 }
7184 width = PyInt_AsLong(v);
7185 if (width < 0) {
7186 flags |= F_LJUST;
7187 width = -width;
7188 }
7189 if (--fmtcnt >= 0)
7190 c = *fmt++;
7191 }
7192 else if (c >= '0' && c <= '9') {
7193 width = c - '0';
7194 while (--fmtcnt >= 0) {
7195 c = *fmt++;
7196 if (c < '0' || c > '9')
7197 break;
7198 if ((width*10) / 10 != width) {
7199 PyErr_SetString(PyExc_ValueError,
7200 "width too big");
7201 goto onError;
7202 }
7203 width = width*10 + (c - '0');
7204 }
7205 }
7206 if (c == '.') {
7207 prec = 0;
7208 if (--fmtcnt >= 0)
7209 c = *fmt++;
7210 if (c == '*') {
7211 v = getnextarg(args, arglen, &argidx);
7212 if (v == NULL)
7213 goto onError;
7214 if (!PyInt_Check(v)) {
7215 PyErr_SetString(PyExc_TypeError,
7216 "* wants int");
7217 goto onError;
7218 }
7219 prec = PyInt_AsLong(v);
7220 if (prec < 0)
7221 prec = 0;
7222 if (--fmtcnt >= 0)
7223 c = *fmt++;
7224 }
7225 else if (c >= '0' && c <= '9') {
7226 prec = c - '0';
7227 while (--fmtcnt >= 0) {
7228 c = Py_CHARMASK(*fmt++);
7229 if (c < '0' || c > '9')
7230 break;
7231 if ((prec*10) / 10 != prec) {
7232 PyErr_SetString(PyExc_ValueError,
7233 "prec too big");
7234 goto onError;
7235 }
7236 prec = prec*10 + (c - '0');
7237 }
7238 }
7239 } /* prec */
7240 if (fmtcnt >= 0) {
7241 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 if (--fmtcnt >= 0)
7243 c = *fmt++;
7244 }
7245 }
7246 if (fmtcnt < 0) {
7247 PyErr_SetString(PyExc_ValueError,
7248 "incomplete format");
7249 goto onError;
7250 }
7251 if (c != '%') {
7252 v = getnextarg(args, arglen, &argidx);
7253 if (v == NULL)
7254 goto onError;
7255 }
7256 sign = 0;
7257 fill = ' ';
7258 switch (c) {
7259
7260 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007261 pbuf = formatbuf;
7262 /* presume that buffer length is at least 1 */
7263 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 len = 1;
7265 break;
7266
7267 case 's':
7268 case 'r':
7269 if (PyUnicode_Check(v) && c == 's') {
7270 temp = v;
7271 Py_INCREF(temp);
7272 }
7273 else {
7274 PyObject *unicode;
7275 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007276 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 else
7278 temp = PyObject_Repr(v);
7279 if (temp == NULL)
7280 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007281 if (PyUnicode_Check(temp))
7282 /* nothing to do */;
7283 else if (PyString_Check(temp)) {
7284 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007285 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007287 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007289 Py_DECREF(temp);
7290 temp = unicode;
7291 if (temp == NULL)
7292 goto onError;
7293 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007294 else {
7295 Py_DECREF(temp);
7296 PyErr_SetString(PyExc_TypeError,
7297 "%s argument has non-string str()");
7298 goto onError;
7299 }
7300 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007301 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 len = PyUnicode_GET_SIZE(temp);
7303 if (prec >= 0 && len > prec)
7304 len = prec;
7305 break;
7306
7307 case 'i':
7308 case 'd':
7309 case 'u':
7310 case 'o':
7311 case 'x':
7312 case 'X':
7313 if (c == 'i')
7314 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007315 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007316 temp = formatlong(v, flags, prec, c);
7317 if (!temp)
7318 goto onError;
7319 pbuf = PyUnicode_AS_UNICODE(temp);
7320 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007321 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007323 else {
7324 pbuf = formatbuf;
7325 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7326 flags, prec, c, v);
7327 if (len < 0)
7328 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007329 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007330 }
7331 if (flags & F_ZERO)
7332 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 break;
7334
7335 case 'e':
7336 case 'E':
7337 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007338 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 case 'g':
7340 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007341 if (c == 'F')
7342 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007343 pbuf = formatbuf;
7344 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7345 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 if (len < 0)
7347 goto onError;
7348 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007349 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 fill = '0';
7351 break;
7352
7353 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007354 pbuf = formatbuf;
7355 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 if (len < 0)
7357 goto onError;
7358 break;
7359
7360 default:
7361 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007362 "unsupported format character '%c' (0x%x) "
7363 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007364 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007365 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007366 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367 goto onError;
7368 }
7369 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007370 if (*pbuf == '-' || *pbuf == '+') {
7371 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 len--;
7373 }
7374 else if (flags & F_SIGN)
7375 sign = '+';
7376 else if (flags & F_BLANK)
7377 sign = ' ';
7378 else
7379 sign = 0;
7380 }
7381 if (width < len)
7382 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007383 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 reslen -= rescnt;
7385 rescnt = width + fmtcnt + 100;
7386 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007387 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007388 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007389 PyErr_NoMemory();
7390 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007391 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007392 if (_PyUnicode_Resize(&result, reslen) < 0) {
7393 Py_XDECREF(temp);
7394 goto onError;
7395 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 res = PyUnicode_AS_UNICODE(result)
7397 + reslen - rescnt;
7398 }
7399 if (sign) {
7400 if (fill != ' ')
7401 *res++ = sign;
7402 rescnt--;
7403 if (width > len)
7404 width--;
7405 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007406 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7407 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007408 assert(pbuf[1] == c);
7409 if (fill != ' ') {
7410 *res++ = *pbuf++;
7411 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007412 }
Tim Petersfff53252001-04-12 18:38:48 +00007413 rescnt -= 2;
7414 width -= 2;
7415 if (width < 0)
7416 width = 0;
7417 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007418 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 if (width > len && !(flags & F_LJUST)) {
7420 do {
7421 --rescnt;
7422 *res++ = fill;
7423 } while (--width > len);
7424 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007425 if (fill == ' ') {
7426 if (sign)
7427 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007428 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007429 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007430 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007431 *res++ = *pbuf++;
7432 *res++ = *pbuf++;
7433 }
7434 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007435 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 res += len;
7437 rescnt -= len;
7438 while (--width >= len) {
7439 --rescnt;
7440 *res++ = ' ';
7441 }
7442 if (dict && (argidx < arglen) && c != '%') {
7443 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007444 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007445 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 goto onError;
7447 }
7448 Py_XDECREF(temp);
7449 } /* '%' */
7450 } /* until end */
7451 if (argidx < arglen && !dict) {
7452 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007453 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 goto onError;
7455 }
7456
Thomas Woutersa96affe2006-03-12 00:29:36 +00007457 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7458 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 if (args_owned) {
7460 Py_DECREF(args);
7461 }
7462 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 return (PyObject *)result;
7464
7465 onError:
7466 Py_XDECREF(result);
7467 Py_DECREF(uformat);
7468 if (args_owned) {
7469 Py_DECREF(args);
7470 }
7471 return NULL;
7472}
7473
7474static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007475 (readbufferproc) unicode_buffer_getreadbuf,
7476 (writebufferproc) unicode_buffer_getwritebuf,
7477 (segcountproc) unicode_buffer_getsegcount,
7478 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479};
7480
Jeremy Hylton938ace62002-07-17 16:30:39 +00007481static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007482unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7483
Tim Peters6d6c1a32001-08-02 04:15:00 +00007484static PyObject *
7485unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7486{
7487 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007488 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007489 char *encoding = NULL;
7490 char *errors = NULL;
7491
Guido van Rossume023fe02001-08-30 03:12:59 +00007492 if (type != &PyUnicode_Type)
7493 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007494 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7495 kwlist, &x, &encoding, &errors))
7496 return NULL;
7497 if (x == NULL)
7498 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007499 if (encoding == NULL && errors == NULL)
7500 return PyObject_Unicode(x);
7501 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007502 return PyUnicode_FromEncodedObject(x, encoding, errors);
7503}
7504
Guido van Rossume023fe02001-08-30 03:12:59 +00007505static PyObject *
7506unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7507{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007508 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007509 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007510
7511 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7512 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7513 if (tmp == NULL)
7514 return NULL;
7515 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007516 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007517 if (pnew == NULL) {
7518 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007519 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007520 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007521 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7522 if (pnew->str == NULL) {
7523 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007524 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007525 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007526 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007527 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007528 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7529 pnew->length = n;
7530 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007531 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007532 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007533}
7534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007535PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007536"unicode(string [, encoding[, errors]]) -> object\n\
7537\n\
7538Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007539encoding defaults to the current default string encoding.\n\
7540errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007541
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542PyTypeObject PyUnicode_Type = {
7543 PyObject_HEAD_INIT(&PyType_Type)
7544 0, /* ob_size */
7545 "unicode", /* tp_name */
7546 sizeof(PyUnicodeObject), /* tp_size */
7547 0, /* tp_itemsize */
7548 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007549 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007551 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 0, /* tp_setattr */
7553 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007554 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007555 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007557 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 (hashfunc) unicode_hash, /* tp_hash*/
7559 0, /* tp_call*/
7560 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007561 PyObject_GenericGetAttr, /* tp_getattro */
7562 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007564 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7565 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007566 unicode_doc, /* tp_doc */
7567 0, /* tp_traverse */
7568 0, /* tp_clear */
7569 0, /* tp_richcompare */
7570 0, /* tp_weaklistoffset */
7571 0, /* tp_iter */
7572 0, /* tp_iternext */
7573 unicode_methods, /* tp_methods */
7574 0, /* tp_members */
7575 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007576 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007577 0, /* tp_dict */
7578 0, /* tp_descr_get */
7579 0, /* tp_descr_set */
7580 0, /* tp_dictoffset */
7581 0, /* tp_init */
7582 0, /* tp_alloc */
7583 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007584 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585};
7586
7587/* Initialize the Unicode implementation */
7588
Thomas Wouters78890102000-07-22 19:25:51 +00007589void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007591 int i;
7592
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007593 /* XXX - move this array to unicodectype.c ? */
7594 Py_UNICODE linebreak[] = {
7595 0x000A, /* LINE FEED */
7596 0x000D, /* CARRIAGE RETURN */
7597 0x001C, /* FILE SEPARATOR */
7598 0x001D, /* GROUP SEPARATOR */
7599 0x001E, /* RECORD SEPARATOR */
7600 0x0085, /* NEXT LINE */
7601 0x2028, /* LINE SEPARATOR */
7602 0x2029, /* PARAGRAPH SEPARATOR */
7603 };
7604
Fred Drakee4315f52000-05-09 19:53:39 +00007605 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007606 unicode_freelist = NULL;
7607 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007609 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007610 for (i = 0; i < 256; i++)
7611 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007612 if (PyType_Ready(&PyUnicode_Type) < 0)
7613 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007614
7615 /* initialize the linebreak bloom filter */
7616 bloom_linebreak = make_bloom_mask(
7617 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7618 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619}
7620
7621/* Finalize the Unicode implementation */
7622
7623void
Thomas Wouters78890102000-07-22 19:25:51 +00007624_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007626 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007627 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007629 Py_XDECREF(unicode_empty);
7630 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007631
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007632 for (i = 0; i < 256; i++) {
7633 if (unicode_latin1[i]) {
7634 Py_DECREF(unicode_latin1[i]);
7635 unicode_latin1[i] = NULL;
7636 }
7637 }
7638
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007639 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640 PyUnicodeObject *v = u;
7641 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007642 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007643 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007644 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007645 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007647 unicode_freelist = NULL;
7648 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007650
Anthony Baxterac6bd462006-04-13 02:06:09 +00007651#ifdef __cplusplus
7652}
7653#endif
7654
7655
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007656/*
7657Local variables:
7658c-basic-offset: 4
7659indent-tabs-mode: nil
7660End:
7661*/