blob: d37ca0c3a532d41000dba62fd3cf3cc9475d79d6 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects on the Unicode free list.

   An object that goes onto the free list keeps its character buffer
   allocated as long as its length is below this limit.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the least
   significant 5 bits of each Unicode character serving as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Evaluates to non-zero if the bit selected by the low 5 bits of ch is
   set in mask.  The shifted constant is an unsigned long so that bit
   index 31 never overflows a signed int, and both arguments are
   parenthesized so that arbitrary expressions may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter through the bloom mask before the exact (and more
   expensive) Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundh95e2a912006-05-26 11:38:15 +0000144Py_LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundh95e2a912006-05-26 11:38:15 +0000158Py_LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* Non-zero if chr is a member of set: the bloom mask is consulted first
   so that unicode_member() only runs for likely candidates.  The whole
   expansion is parenthesized so the macro can be negated or combined
   with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000315 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
354/* Internal API for use in unicodeobject.c only ! */
355#define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000557 obj->ob_type->tp_name);
558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000800{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000808 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000809 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000846 goto onError;
847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000859 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Per-character classification for the UTF-7 codec, indexed by ASCII
   code (0..127).  The value tells whether the character is special,
   i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   Each row below covers 16 consecutive codes. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */  2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */  3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must not appear literally in UTF-7 text: anything outside
   1..127, anything classified 1 in utf7_special, plus whitespace
   (class 2) when encodeWS is set and Set O (class 3) when encodeO is
   set.  Evaluates c more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Explicit ASCII ranges are
   used instead of isalnum(): isalnum() depends on the current locale and
   is undefined for arguments that do not fit in an unsigned char, which
   Py_UNICODE values may not.  Evaluates c more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base64 digit c; only meaningful when B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits through out for as long as at least six pending
   bits remain in ch; bits is reduced by six per digit written. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units through out for as long as at least sixteen pending
   bits remain in ch.  Expects `errmsg` and a `utf7Error` label to exist
   at the point of expansion. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000943 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 const char *errors)
945{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000971 Py_UNICODE ch;
972 restart:
973 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000987 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000988 }
989 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000990 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000996 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001001 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001006 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001032 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 }
1049
1050 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001059 if (s < e)
1060 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001064 goto onError;
1065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001068 return (PyObject *)unicode;
1069
1070onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001079 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001086 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001087 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001088 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001106 if (ch == '+') {
1107 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001114 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001136 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
Tim Petersced69f82003-09-16 20:30:58 +00001159 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001160 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001161 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
Tim Peters5de98422002-04-27 18:44:32 +00001167 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.
   Each row below covers 16 consecutive byte values. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x0F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 - 0x4F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 - 0x6F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0x8F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 - 0x9F */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 - 0xAF */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 - 0xBF */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xCF */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 - 0xDF */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001203 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 const char *errors)
1205{
Walter Dörwald69652032004-09-07 20:24:22 +00001206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001210 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001211 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001212 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
1244 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001245 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001252 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262
1263 switch (n) {
1264
1265 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 startinpos = s-starts;
1268 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001269 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270
1271 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001272 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276
1277 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001280 startinpos = s-starts;
1281 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001282 goto utf8Error;
1283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 break;
1294
1295 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001296 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001299 startinpos = s-starts;
1300 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001301 goto utf8Error;
1302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001306 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 startinpos = s-starts;
1314 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001315 goto utf8Error;
1316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001318 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 startinpos = s-starts;
1327 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001328 goto utf8Error;
1329 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001333 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001334 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001336 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001337 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001338 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001339 startinpos = s-starts;
1340 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001341 goto utf8Error;
1342 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001343#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001344 *p++ = (Py_UNICODE)ch;
1345#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001347
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001350
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001353
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001354 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001361 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362 startinpos = s-starts;
1363 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001364 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 }
1366 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001367 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001368
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 }
Walter Dörwald69652032004-09-07 20:24:22 +00001378 if (consumed)
1379 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
1381 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 goto onError;
1384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return (PyObject *)unicode;
1388
1389onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
Tim Peters602f7402002-04-27 18:03:26 +00001396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001403 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405{
Tim Peters602f7402002-04-27 18:03:26 +00001406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001407
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001411 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001412 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001413 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001414
Tim Peters602f7402002-04-27 18:03:26 +00001415 assert(s != NULL);
1416 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001437
Tim Peters602f7402002-04-27 18:03:26 +00001438 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001439 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001440
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001441 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001442 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001444
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001446 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001449 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001450 else {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001460 i++;
1461 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001462 }
Tim Peters602f7402002-04-27 18:03:26 +00001463 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001464 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001478
Tim Peters602f7402002-04-27 18:03:26 +00001479 if (v == NULL) {
1480 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001481 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001487 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001492
Tim Peters602f7402002-04-27 18:03:26 +00001493#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494}
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
Tim Peters772747b2001-08-09 22:21:55 +00001509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001511 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001512 const char *errors,
1513 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514{
Walter Dörwald69652032004-09-07 20:24:22 +00001515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001520 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001521 const char *errors,
1522 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001523 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001533 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001553 q = (unsigned char *)s;
1554 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555
1556 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001557 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
Tim Petersced69f82003-09-16 20:30:58 +00001575#else
Walter Dörwald69652032004-09-07 20:24:22 +00001576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001584#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001585 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587
Tim Peters772747b2001-08-09 22:21:55 +00001588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001600 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001601 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001602 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001603 if (consumed)
1604 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
Tim Peters772747b2001-08-09 22:21:55 +00001614 q += 2;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001622 if (q >= e) {
1623 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001626 goto utf16Error;
1627 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001628 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001632#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001633 *p++ = ch;
1634 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 goto utf16Error;
1645 }
1646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001648 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001651 /* Fall through to report the error */
1652
1653 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
Walter Dörwald69652032004-09-07 20:24:22 +00001666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671 goto onError;
1672
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681 return NULL;
1682}
1683
Tim Peters772747b2001-08-09 22:21:55 +00001684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001687 const char *errors,
1688 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689{
1690 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001691 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001692#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001693 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001694#else
1695 const int pairs = 0;
1696#endif
Tim Peters772747b2001-08-09 22:21:55 +00001697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001711#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001715#endif
Tim Petersced69f82003-09-16 20:30:58 +00001716 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001717 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718 if (v == NULL)
1719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720
Tim Peters772747b2001-08-09 22:21:55 +00001721 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001723 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001724 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001725 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001741#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001742 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#endif
Tim Peters772747b2001-08-09 22:21:55 +00001747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
Fredrik Lundh06d12682001-01-24 07:59:11 +00001769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001770
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001772 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 const char *errors)
1774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001775 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001779 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001787
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001799 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (s < end) {
1803 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001804 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001809 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 continue;
1811 }
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001834 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001836 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841 break;
1842
Fredrik Lundhccc74732001-02-18 22:13:49 +00001843 /* hex escapes */
1844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849
Fredrik Lundhccc74732001-02-18 22:13:49 +00001850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855
Fredrik Lundhccc74732001-02-18 22:13:49 +00001856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001874 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001875 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 *p++ = chr;
1908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001921 goto onError;
1922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001930 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001935 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001936 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001937 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001953 goto store;
1954 }
1955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001963 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001964 break;
1965
1966 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001967 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001983 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 nextByte:
1986 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001993
Fredrik Lundhccc74732001-02-18 22:13:49 +00001994ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001999 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002002 return NULL;
2003
Fredrik Lundhccc74732001-02-18 22:13:49 +00002004onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 return NULL;
2009}
2010
/* unicodeescape_string() (defined after the findchar() helper below)
   returns a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2017
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002018Py_LOCAL(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019 Py_ssize_t size,
2020 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002032
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002035 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002041 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
2043 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2044 if (repr == NULL)
2045 return NULL;
2046
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002047 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048
2049 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002051 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 !findchar(s, size, '"')) ? '"' : '\'';
2053 }
2054 while (size-- > 0) {
2055 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002056
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002057 /* Escape quotes and backslashes */
2058 if ((quotes &&
2059 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 *p++ = '\\';
2061 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002062 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002064
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002065#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002066 /* Map 21-bit characters to '\U00xxxxxx' */
2067 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002068 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002069
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002070 /* Resize the string if necessary */
2071 if (offset + 12 > PyString_GET_SIZE(repr)) {
2072 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002073 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074 p = PyString_AS_STRING(repr) + offset;
2075 }
2076
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002077 *p++ = '\\';
2078 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002079 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2080 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2081 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2082 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2083 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2084 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2085 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002086 *p++ = hexdigit[ch & 0x0000000F];
2087 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002088 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002089#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002090 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2091 else if (ch >= 0xD800 && ch < 0xDC00) {
2092 Py_UNICODE ch2;
2093 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002094
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002095 ch2 = *s++;
2096 size--;
2097 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2098 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2099 *p++ = '\\';
2100 *p++ = 'U';
2101 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2102 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2103 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2104 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2105 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2106 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2107 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2108 *p++ = hexdigit[ucs & 0x0000000F];
2109 continue;
2110 }
2111 /* Fall through: isolated surrogates are copied as-is */
2112 s--;
2113 size++;
2114 }
2115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002117 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 *p++ = '\\';
2119 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002120 *p++ = hexdigit[(ch >> 12) & 0x000F];
2121 *p++ = hexdigit[(ch >> 8) & 0x000F];
2122 *p++ = hexdigit[(ch >> 4) & 0x000F];
2123 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002125
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002126 /* Map special whitespace to '\t', \n', '\r' */
2127 else if (ch == '\t') {
2128 *p++ = '\\';
2129 *p++ = 't';
2130 }
2131 else if (ch == '\n') {
2132 *p++ = '\\';
2133 *p++ = 'n';
2134 }
2135 else if (ch == '\r') {
2136 *p++ = '\\';
2137 *p++ = 'r';
2138 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002139
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002140 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002141 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002143 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002144 *p++ = hexdigit[(ch >> 4) & 0x000F];
2145 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002146 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002147
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 /* Copy everything else as-is */
2149 else
2150 *p++ = (char) ch;
2151 }
2152 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002153 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002154
2155 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002156 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 return repr;
2158}
2159
2160PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002161 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162{
2163 return unicodeescape_string(s, size, 0);
2164}
2165
2166PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2167{
2168 if (!PyUnicode_Check(unicode)) {
2169 PyErr_BadArgument();
2170 return NULL;
2171 }
2172 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2173 PyUnicode_GET_SIZE(unicode));
2174}
2175
2176/* --- Raw Unicode Escape Codec ------------------------------------------- */
2177
2178PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002179 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 const char *errors)
2181{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002182 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002183 Py_ssize_t startinpos;
2184 Py_ssize_t endinpos;
2185 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 const char *end;
2189 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002190 PyObject *errorHandler = NULL;
2191 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002192
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 /* Escaped strings will always be longer than the resulting
2194 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 length after conversion to the true value. (But decoding error
2196 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 v = _PyUnicode_New(size);
2198 if (v == NULL)
2199 goto onError;
2200 if (size == 0)
2201 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 end = s + size;
2204 while (s < end) {
2205 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002206 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002208 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209
2210 /* Non-escape characters are interpreted as Unicode ordinals */
2211 if (*s != '\\') {
2212 *p++ = (unsigned char)*s++;
2213 continue;
2214 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
2217 /* \u-escapes are only interpreted iff the number of leading
2218 backslashes if odd */
2219 bs = s;
2220 for (;s < end;) {
2221 if (*s != '\\')
2222 break;
2223 *p++ = (unsigned char)*s++;
2224 }
2225 if (((s - bs) & 1) == 0 ||
2226 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 continue;
2229 }
2230 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 s++;
2233
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002234 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002236 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002237 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002238 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002239 endinpos = s-starts;
2240 if (unicode_decode_call_errorhandler(
2241 errors, &errorHandler,
2242 "rawunicodeescape", "truncated \\uXXXX",
2243 starts, size, &startinpos, &endinpos, &exc, &s,
2244 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 }
2248 x = (x<<4) & ~0xF;
2249 if (c >= '0' && c <= '9')
2250 x += c - '0';
2251 else if (c >= 'a' && c <= 'f')
2252 x += 10 + c - 'a';
2253 else
2254 x += 10 + c - 'A';
2255 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002256#ifndef Py_UNICODE_WIDE
2257 if (x > 0x10000) {
2258 if (unicode_decode_call_errorhandler(
2259 errors, &errorHandler,
2260 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2261 starts, size, &startinpos, &endinpos, &exc, &s,
2262 (PyObject **)&v, &outpos, &p))
2263 goto onError;
2264 }
2265#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 *p++ = x;
2267 nextByte:
2268 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002270 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002271 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_XDECREF(errorHandler);
2273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002275
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 onError:
2277 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002278 Py_XDECREF(errorHandler);
2279 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 return NULL;
2281}
2282
2283PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002284 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285{
2286 PyObject *repr;
2287 char *p;
2288 char *q;
2289
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002290 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002292#ifdef Py_UNICODE_WIDE
2293 repr = PyString_FromStringAndSize(NULL, 10 * size);
2294#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002296#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 if (repr == NULL)
2298 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002299 if (size == 0)
2300 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301
2302 p = q = PyString_AS_STRING(repr);
2303 while (size-- > 0) {
2304 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002305#ifdef Py_UNICODE_WIDE
2306 /* Map 32-bit characters to '\Uxxxxxxxx' */
2307 if (ch >= 0x10000) {
2308 *p++ = '\\';
2309 *p++ = 'U';
2310 *p++ = hexdigit[(ch >> 28) & 0xf];
2311 *p++ = hexdigit[(ch >> 24) & 0xf];
2312 *p++ = hexdigit[(ch >> 20) & 0xf];
2313 *p++ = hexdigit[(ch >> 16) & 0xf];
2314 *p++ = hexdigit[(ch >> 12) & 0xf];
2315 *p++ = hexdigit[(ch >> 8) & 0xf];
2316 *p++ = hexdigit[(ch >> 4) & 0xf];
2317 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002318 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002319 else
2320#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002321 /* Map 16-bit characters to '\uxxxx' */
2322 if (ch >= 256) {
2323 *p++ = '\\';
2324 *p++ = 'u';
2325 *p++ = hexdigit[(ch >> 12) & 0xf];
2326 *p++ = hexdigit[(ch >> 8) & 0xf];
2327 *p++ = hexdigit[(ch >> 4) & 0xf];
2328 *p++ = hexdigit[ch & 15];
2329 }
2330 /* Copy everything else as-is */
2331 else
2332 *p++ = (char) ch;
2333 }
2334 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002335 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336 return repr;
2337}
2338
2339PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2340{
2341 if (!PyUnicode_Check(unicode)) {
2342 PyErr_BadArgument();
2343 return NULL;
2344 }
2345 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2346 PyUnicode_GET_SIZE(unicode));
2347}
2348
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002349/* --- Unicode Internal Codec ------------------------------------------- */
2350
2351PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002352 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002353 const char *errors)
2354{
2355 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002356 Py_ssize_t startinpos;
2357 Py_ssize_t endinpos;
2358 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002359 PyUnicodeObject *v;
2360 Py_UNICODE *p;
2361 const char *end;
2362 const char *reason;
2363 PyObject *errorHandler = NULL;
2364 PyObject *exc = NULL;
2365
Neal Norwitzd43069c2006-01-08 01:12:10 +00002366#ifdef Py_UNICODE_WIDE
2367 Py_UNICODE unimax = PyUnicode_GetMax();
2368#endif
2369
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002370 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2371 if (v == NULL)
2372 goto onError;
2373 if (PyUnicode_GetSize((PyObject *)v) == 0)
2374 return (PyObject *)v;
2375 p = PyUnicode_AS_UNICODE(v);
2376 end = s + size;
2377
2378 while (s < end) {
Martin v. Löwisd004fc82006-05-27 08:36:52 +00002379 *p = *(Py_UNICODE*)s;
Neal Norwitz1004a532006-05-15 07:17:23 +00002380 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002381 /* We have to sanity check the raw data, otherwise doom looms for
2382 some malformed UCS-4 data. */
2383 if (
2384 #ifdef Py_UNICODE_WIDE
2385 *p > unimax || *p < 0 ||
2386 #endif
2387 end-s < Py_UNICODE_SIZE
2388 )
2389 {
2390 startinpos = s - starts;
2391 if (end-s < Py_UNICODE_SIZE) {
2392 endinpos = end-starts;
2393 reason = "truncated input";
2394 }
2395 else {
2396 endinpos = s - starts + Py_UNICODE_SIZE;
2397 reason = "illegal code point (> 0x10FFFF)";
2398 }
2399 outpos = p - PyUnicode_AS_UNICODE(v);
2400 if (unicode_decode_call_errorhandler(
2401 errors, &errorHandler,
2402 "unicode_internal", reason,
2403 starts, size, &startinpos, &endinpos, &exc, &s,
2404 (PyObject **)&v, &outpos, &p)) {
2405 goto onError;
2406 }
2407 }
2408 else {
2409 p++;
2410 s += Py_UNICODE_SIZE;
2411 }
2412 }
2413
Martin v. Löwis412fb672006-04-13 06:34:32 +00002414 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002415 goto onError;
2416 Py_XDECREF(errorHandler);
2417 Py_XDECREF(exc);
2418 return (PyObject *)v;
2419
2420 onError:
2421 Py_XDECREF(v);
2422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
2424 return NULL;
2425}
2426
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427/* --- Latin-1 Codec ------------------------------------------------------ */
2428
2429PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002430 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431 const char *errors)
2432{
2433 PyUnicodeObject *v;
2434 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002435
Guido van Rossumd57fd912000-03-10 22:53:23 +00002436 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002437 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002438 Py_UNICODE r = *(unsigned char*)s;
2439 return PyUnicode_FromUnicode(&r, 1);
2440 }
2441
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 v = _PyUnicode_New(size);
2443 if (v == NULL)
2444 goto onError;
2445 if (size == 0)
2446 return (PyObject *)v;
2447 p = PyUnicode_AS_UNICODE(v);
2448 while (size-- > 0)
2449 *p++ = (unsigned char)*s++;
2450 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002451
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452 onError:
2453 Py_XDECREF(v);
2454 return NULL;
2455}
2456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457/* create or adjust a UnicodeEncodeError */
2458static void make_encode_exception(PyObject **exceptionObject,
2459 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002460 const Py_UNICODE *unicode, Py_ssize_t size,
2461 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002462 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002464 if (*exceptionObject == NULL) {
2465 *exceptionObject = PyUnicodeEncodeError_Create(
2466 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 }
2468 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002469 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2470 goto onError;
2471 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2472 goto onError;
2473 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2474 goto onError;
2475 return;
2476 onError:
2477 Py_DECREF(*exceptionObject);
2478 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479 }
2480}
2481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002482/* raises a UnicodeEncodeError */
2483static void raise_encode_exception(PyObject **exceptionObject,
2484 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002485 const Py_UNICODE *unicode, Py_ssize_t size,
2486 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002487 const char *reason)
2488{
2489 make_encode_exception(exceptionObject,
2490 encoding, unicode, size, startpos, endpos, reason);
2491 if (*exceptionObject != NULL)
2492 PyCodec_StrictErrors(*exceptionObject);
2493}
2494
2495/* error handling callback helper:
2496 build arguments, call the callback and check the arguments,
2497 put the result into newpos and return the replacement string, which
2498 has to be freed by the caller */
2499static PyObject *unicode_encode_call_errorhandler(const char *errors,
2500 PyObject **errorHandler,
2501 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002502 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2503 Py_ssize_t startpos, Py_ssize_t endpos,
2504 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002505{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002506 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002507
2508 PyObject *restuple;
2509 PyObject *resunicode;
2510
2511 if (*errorHandler == NULL) {
2512 *errorHandler = PyCodec_LookupError(errors);
2513 if (*errorHandler == NULL)
2514 return NULL;
2515 }
2516
2517 make_encode_exception(exceptionObject,
2518 encoding, unicode, size, startpos, endpos, reason);
2519 if (*exceptionObject == NULL)
2520 return NULL;
2521
2522 restuple = PyObject_CallFunctionObjArgs(
2523 *errorHandler, *exceptionObject, NULL);
2524 if (restuple == NULL)
2525 return NULL;
2526 if (!PyTuple_Check(restuple)) {
2527 PyErr_Format(PyExc_TypeError, &argparse[4]);
2528 Py_DECREF(restuple);
2529 return NULL;
2530 }
2531 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2532 &resunicode, newpos)) {
2533 Py_DECREF(restuple);
2534 return NULL;
2535 }
2536 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002537 *newpos = size+*newpos;
2538 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002539 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002540 Py_DECREF(restuple);
2541 return NULL;
2542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 Py_INCREF(resunicode);
2544 Py_DECREF(restuple);
2545 return resunicode;
2546}
2547
2548static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002549 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002550 const char *errors,
2551 int limit)
2552{
2553 /* output object */
2554 PyObject *res;
2555 /* pointers to the beginning and end+1 of input */
2556 const Py_UNICODE *startp = p;
2557 const Py_UNICODE *endp = p + size;
2558 /* pointer to the beginning of the unencodable characters */
2559 /* const Py_UNICODE *badp = NULL; */
2560 /* pointer into the output */
2561 char *str;
2562 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002563 Py_ssize_t respos = 0;
2564 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002565 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2566 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002567 PyObject *errorHandler = NULL;
2568 PyObject *exc = NULL;
2569 /* the following variable is used for caching string comparisons
2570 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2571 int known_errorHandler = -1;
2572
2573 /* allocate enough for a simple encoding without
2574 replacements, if we need more, we'll resize */
2575 res = PyString_FromStringAndSize(NULL, size);
2576 if (res == NULL)
2577 goto onError;
2578 if (size == 0)
2579 return res;
2580 str = PyString_AS_STRING(res);
2581 ressize = size;
2582
2583 while (p<endp) {
2584 Py_UNICODE c = *p;
2585
2586 /* can we encode this? */
2587 if (c<limit) {
2588 /* no overflow check, because we know that the space is enough */
2589 *str++ = (char)c;
2590 ++p;
2591 }
2592 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002593 Py_ssize_t unicodepos = p-startp;
2594 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002596 Py_ssize_t repsize;
2597 Py_ssize_t newpos;
2598 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002599 Py_UNICODE *uni2;
2600 /* startpos for collecting unencodable chars */
2601 const Py_UNICODE *collstart = p;
2602 const Py_UNICODE *collend = p;
2603 /* find all unecodable characters */
2604 while ((collend < endp) && ((*collend)>=limit))
2605 ++collend;
2606 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2607 if (known_errorHandler==-1) {
2608 if ((errors==NULL) || (!strcmp(errors, "strict")))
2609 known_errorHandler = 1;
2610 else if (!strcmp(errors, "replace"))
2611 known_errorHandler = 2;
2612 else if (!strcmp(errors, "ignore"))
2613 known_errorHandler = 3;
2614 else if (!strcmp(errors, "xmlcharrefreplace"))
2615 known_errorHandler = 4;
2616 else
2617 known_errorHandler = 0;
2618 }
2619 switch (known_errorHandler) {
2620 case 1: /* strict */
2621 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2622 goto onError;
2623 case 2: /* replace */
2624 while (collstart++<collend)
2625 *str++ = '?'; /* fall through */
2626 case 3: /* ignore */
2627 p = collend;
2628 break;
2629 case 4: /* xmlcharrefreplace */
2630 respos = str-PyString_AS_STRING(res);
2631 /* determine replacement size (temporarily (mis)uses p) */
2632 for (p = collstart, repsize = 0; p < collend; ++p) {
2633 if (*p<10)
2634 repsize += 2+1+1;
2635 else if (*p<100)
2636 repsize += 2+2+1;
2637 else if (*p<1000)
2638 repsize += 2+3+1;
2639 else if (*p<10000)
2640 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002641#ifndef Py_UNICODE_WIDE
2642 else
2643 repsize += 2+5+1;
2644#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002645 else if (*p<100000)
2646 repsize += 2+5+1;
2647 else if (*p<1000000)
2648 repsize += 2+6+1;
2649 else
2650 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002651#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002652 }
2653 requiredsize = respos+repsize+(endp-collend);
2654 if (requiredsize > ressize) {
2655 if (requiredsize<2*ressize)
2656 requiredsize = 2*ressize;
2657 if (_PyString_Resize(&res, requiredsize))
2658 goto onError;
2659 str = PyString_AS_STRING(res) + respos;
2660 ressize = requiredsize;
2661 }
2662 /* generate replacement (temporarily (mis)uses p) */
2663 for (p = collstart; p < collend; ++p) {
2664 str += sprintf(str, "&#%d;", (int)*p);
2665 }
2666 p = collend;
2667 break;
2668 default:
2669 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2670 encoding, reason, startp, size, &exc,
2671 collstart-startp, collend-startp, &newpos);
2672 if (repunicode == NULL)
2673 goto onError;
2674 /* need more space? (at least enough for what we
2675 have+the replacement+the rest of the string, so
2676 we won't have to check space for encodable characters) */
2677 respos = str-PyString_AS_STRING(res);
2678 repsize = PyUnicode_GET_SIZE(repunicode);
2679 requiredsize = respos+repsize+(endp-collend);
2680 if (requiredsize > ressize) {
2681 if (requiredsize<2*ressize)
2682 requiredsize = 2*ressize;
2683 if (_PyString_Resize(&res, requiredsize)) {
2684 Py_DECREF(repunicode);
2685 goto onError;
2686 }
2687 str = PyString_AS_STRING(res) + respos;
2688 ressize = requiredsize;
2689 }
2690 /* check if there is anything unencodable in the replacement
2691 and copy it to the output */
2692 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2693 c = *uni2;
2694 if (c >= limit) {
2695 raise_encode_exception(&exc, encoding, startp, size,
2696 unicodepos, unicodepos+1, reason);
2697 Py_DECREF(repunicode);
2698 goto onError;
2699 }
2700 *str = (char)c;
2701 }
2702 p = startp + newpos;
2703 Py_DECREF(repunicode);
2704 }
2705 }
2706 }
2707 /* Resize if we allocated to much */
2708 respos = str-PyString_AS_STRING(res);
2709 if (respos<ressize)
2710 /* If this falls res will be NULL */
2711 _PyString_Resize(&res, respos);
2712 Py_XDECREF(errorHandler);
2713 Py_XDECREF(exc);
2714 return res;
2715
2716 onError:
2717 Py_XDECREF(res);
2718 Py_XDECREF(errorHandler);
2719 Py_XDECREF(exc);
2720 return NULL;
2721}
2722
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002724 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 const char *errors)
2726{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728}
2729
2730PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2731{
2732 if (!PyUnicode_Check(unicode)) {
2733 PyErr_BadArgument();
2734 return NULL;
2735 }
2736 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2737 PyUnicode_GET_SIZE(unicode),
2738 NULL);
2739}
2740
2741/* --- 7-bit ASCII Codec -------------------------------------------------- */
2742
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002744 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 const char *errors)
2746{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 PyUnicodeObject *v;
2749 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002750 Py_ssize_t startinpos;
2751 Py_ssize_t endinpos;
2752 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 const char *e;
2754 PyObject *errorHandler = NULL;
2755 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002756
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002758 if (size == 1 && *(unsigned char*)s < 128) {
2759 Py_UNICODE r = *(unsigned char*)s;
2760 return PyUnicode_FromUnicode(&r, 1);
2761 }
Tim Petersced69f82003-09-16 20:30:58 +00002762
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 v = _PyUnicode_New(size);
2764 if (v == NULL)
2765 goto onError;
2766 if (size == 0)
2767 return (PyObject *)v;
2768 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 e = s + size;
2770 while (s < e) {
2771 register unsigned char c = (unsigned char)*s;
2772 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002774 ++s;
2775 }
2776 else {
2777 startinpos = s-starts;
2778 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002779 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002780 if (unicode_decode_call_errorhandler(
2781 errors, &errorHandler,
2782 "ascii", "ordinal not in range(128)",
2783 starts, size, &startinpos, &endinpos, &exc, &s,
2784 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002788 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002789 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002790 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791 Py_XDECREF(errorHandler);
2792 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002794
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 onError:
2796 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002797 Py_XDECREF(errorHandler);
2798 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 return NULL;
2800}
2801
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002803 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 const char *errors)
2805{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002806 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807}
2808
2809PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2810{
2811 if (!PyUnicode_Check(unicode)) {
2812 PyErr_BadArgument();
2813 return NULL;
2814 }
2815 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2816 PyUnicode_GET_SIZE(unicode),
2817 NULL);
2818}
2819
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002820#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002821
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002822/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002823
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002824PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002825 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002826 const char *errors)
2827{
2828 PyUnicodeObject *v;
2829 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002830 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002831
2832 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002833 assert(size < INT_MAX);
2834 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002835 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002836 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2837
2838 v = _PyUnicode_New(usize);
2839 if (v == NULL)
2840 return NULL;
2841 if (usize == 0)
2842 return (PyObject *)v;
2843 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002844 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002845 Py_DECREF(v);
2846 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2847 }
2848
2849 return (PyObject *)v;
2850}
2851
2852PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002853 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002854 const char *errors)
2855{
2856 PyObject *repr;
2857 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002858 DWORD mbcssize;
2859
2860 /* If there are no characters, bail now! */
2861 if (size==0)
2862 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002863
2864 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002865 assert(size<INT_MAX);
2866 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002867 if (mbcssize==0)
2868 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2869
2870 repr = PyString_FromStringAndSize(NULL, mbcssize);
2871 if (repr == NULL)
2872 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002873 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002874 return repr;
2875
2876 /* Do the conversion */
2877 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002878 assert(size < INT_MAX);
2879 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002880 Py_DECREF(repr);
2881 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2882 }
2883 return repr;
2884}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002885
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002886PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2887{
2888 if (!PyUnicode_Check(unicode)) {
2889 PyErr_BadArgument();
2890 return NULL;
2891 }
2892 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2893 PyUnicode_GET_SIZE(unicode),
2894 NULL);
2895}
2896
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002897#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002898
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899/* --- Character Mapping Codec -------------------------------------------- */
2900
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002902 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 PyObject *mapping,
2904 const char *errors)
2905{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002906 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002907 Py_ssize_t startinpos;
2908 Py_ssize_t endinpos;
2909 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002910 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 PyUnicodeObject *v;
2912 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002913 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002914 PyObject *errorHandler = NULL;
2915 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002916 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002917 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002918
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 /* Default to Latin-1 */
2920 if (mapping == NULL)
2921 return PyUnicode_DecodeLatin1(s, size, errors);
2922
2923 v = _PyUnicode_New(size);
2924 if (v == NULL)
2925 goto onError;
2926 if (size == 0)
2927 return (PyObject *)v;
2928 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002929 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002930 if (PyUnicode_CheckExact(mapping)) {
2931 mapstring = PyUnicode_AS_UNICODE(mapping);
2932 maplen = PyUnicode_GET_SIZE(mapping);
2933 while (s < e) {
2934 unsigned char ch = *s;
2935 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002937 if (ch < maplen)
2938 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002940 if (x == 0xfffe) {
2941 /* undefined mapping */
2942 outpos = p-PyUnicode_AS_UNICODE(v);
2943 startinpos = s-starts;
2944 endinpos = startinpos+1;
2945 if (unicode_decode_call_errorhandler(
2946 errors, &errorHandler,
2947 "charmap", "character maps to <undefined>",
2948 starts, size, &startinpos, &endinpos, &exc, &s,
2949 (PyObject **)&v, &outpos, &p)) {
2950 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002951 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002952 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002953 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002954 *p++ = x;
2955 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002957 }
2958 else {
2959 while (s < e) {
2960 unsigned char ch = *s;
2961 PyObject *w, *x;
2962
2963 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2964 w = PyInt_FromLong((long)ch);
2965 if (w == NULL)
2966 goto onError;
2967 x = PyObject_GetItem(mapping, w);
2968 Py_DECREF(w);
2969 if (x == NULL) {
2970 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2971 /* No mapping found means: mapping is undefined. */
2972 PyErr_Clear();
2973 x = Py_None;
2974 Py_INCREF(x);
2975 } else
2976 goto onError;
2977 }
2978
2979 /* Apply mapping */
2980 if (PyInt_Check(x)) {
2981 long value = PyInt_AS_LONG(x);
2982 if (value < 0 || value > 65535) {
2983 PyErr_SetString(PyExc_TypeError,
2984 "character mapping must be in range(65536)");
2985 Py_DECREF(x);
2986 goto onError;
2987 }
2988 *p++ = (Py_UNICODE)value;
2989 }
2990 else if (x == Py_None) {
2991 /* undefined mapping */
2992 outpos = p-PyUnicode_AS_UNICODE(v);
2993 startinpos = s-starts;
2994 endinpos = startinpos+1;
2995 if (unicode_decode_call_errorhandler(
2996 errors, &errorHandler,
2997 "charmap", "character maps to <undefined>",
2998 starts, size, &startinpos, &endinpos, &exc, &s,
2999 (PyObject **)&v, &outpos, &p)) {
3000 Py_DECREF(x);
3001 goto onError;
3002 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003003 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003004 continue;
3005 }
3006 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003007 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003008
3009 if (targetsize == 1)
3010 /* 1-1 mapping */
3011 *p++ = *PyUnicode_AS_UNICODE(x);
3012
3013 else if (targetsize > 1) {
3014 /* 1-n mapping */
3015 if (targetsize > extrachars) {
3016 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003017 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3018 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003019 (targetsize << 2);
3020 extrachars += needed;
3021 if (_PyUnicode_Resize(&v,
3022 PyUnicode_GET_SIZE(v) + needed) < 0) {
3023 Py_DECREF(x);
3024 goto onError;
3025 }
3026 p = PyUnicode_AS_UNICODE(v) + oldpos;
3027 }
3028 Py_UNICODE_COPY(p,
3029 PyUnicode_AS_UNICODE(x),
3030 targetsize);
3031 p += targetsize;
3032 extrachars -= targetsize;
3033 }
3034 /* 1-0 mapping: skip the character */
3035 }
3036 else {
3037 /* wrong return value */
3038 PyErr_SetString(PyExc_TypeError,
3039 "character mapping must return integer, None or unicode");
3040 Py_DECREF(x);
3041 goto onError;
3042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003044 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 }
3047 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003048 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003050 Py_XDECREF(errorHandler);
3051 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003053
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003055 Py_XDECREF(errorHandler);
3056 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 Py_XDECREF(v);
3058 return NULL;
3059}
3060
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061/* Lookup the character ch in the mapping. If the character
3062 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003063 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003066 PyObject *w = PyInt_FromLong((long)c);
3067 PyObject *x;
3068
3069 if (w == NULL)
3070 return NULL;
3071 x = PyObject_GetItem(mapping, w);
3072 Py_DECREF(w);
3073 if (x == NULL) {
3074 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3075 /* No mapping found means: mapping is undefined. */
3076 PyErr_Clear();
3077 x = Py_None;
3078 Py_INCREF(x);
3079 return x;
3080 } else
3081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003083 else if (x == Py_None)
3084 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003085 else if (PyInt_Check(x)) {
3086 long value = PyInt_AS_LONG(x);
3087 if (value < 0 || value > 255) {
3088 PyErr_SetString(PyExc_TypeError,
3089 "character mapping must be in range(256)");
3090 Py_DECREF(x);
3091 return NULL;
3092 }
3093 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003095 else if (PyString_Check(x))
3096 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 /* wrong return value */
3099 PyErr_SetString(PyExc_TypeError,
3100 "character mapping must return integer, None or str");
3101 Py_DECREF(x);
3102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 }
3104}
3105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106/* lookup the character, put the result in the output string and adjust
3107 various state variables. Reallocate the output string if not enough
3108 space is available. Return a new reference to the object that
3109 was put in the output buffer, or Py_None, if the mapping was undefined
3110 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003111 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003112static
3113PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003114 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003115{
3116 PyObject *rep = charmapencode_lookup(c, mapping);
3117
3118 if (rep==NULL)
3119 return NULL;
3120 else if (rep==Py_None)
3121 return rep;
3122 else {
3123 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003124 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003125 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003126 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003127 if (outsize<requiredsize) {
3128 /* exponentially overallocate to minimize reallocations */
3129 if (requiredsize < 2*outsize)
3130 requiredsize = 2*outsize;
3131 if (_PyString_Resize(outobj, requiredsize)) {
3132 Py_DECREF(rep);
3133 return NULL;
3134 }
3135 outstart = PyString_AS_STRING(*outobj);
3136 }
3137 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3138 }
3139 else {
3140 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003141 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3142 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003143 if (outsize<requiredsize) {
3144 /* exponentially overallocate to minimize reallocations */
3145 if (requiredsize < 2*outsize)
3146 requiredsize = 2*outsize;
3147 if (_PyString_Resize(outobj, requiredsize)) {
3148 Py_DECREF(rep);
3149 return NULL;
3150 }
3151 outstart = PyString_AS_STRING(*outobj);
3152 }
3153 memcpy(outstart + *outpos, repchars, repsize);
3154 *outpos += repsize;
3155 }
3156 }
3157 return rep;
3158}
3159
3160/* handle an error in PyUnicode_EncodeCharmap
3161 Return 0 on success, -1 on error */
3162static
3163int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003164 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003166 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003167 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168{
3169 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003170 Py_ssize_t repsize;
3171 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003172 Py_UNICODE *uni2;
3173 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003174 Py_ssize_t collstartpos = *inpos;
3175 Py_ssize_t collendpos = *inpos+1;
3176 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 char *encoding = "charmap";
3178 char *reason = "character maps to <undefined>";
3179
3180 PyObject *x;
3181 /* find all unencodable characters */
3182 while (collendpos < size) {
3183 x = charmapencode_lookup(p[collendpos], mapping);
3184 if (x==NULL)
3185 return -1;
3186 else if (x!=Py_None) {
3187 Py_DECREF(x);
3188 break;
3189 }
3190 Py_DECREF(x);
3191 ++collendpos;
3192 }
3193 /* cache callback name lookup
3194 * (if not done yet, i.e. it's the first error) */
3195 if (*known_errorHandler==-1) {
3196 if ((errors==NULL) || (!strcmp(errors, "strict")))
3197 *known_errorHandler = 1;
3198 else if (!strcmp(errors, "replace"))
3199 *known_errorHandler = 2;
3200 else if (!strcmp(errors, "ignore"))
3201 *known_errorHandler = 3;
3202 else if (!strcmp(errors, "xmlcharrefreplace"))
3203 *known_errorHandler = 4;
3204 else
3205 *known_errorHandler = 0;
3206 }
3207 switch (*known_errorHandler) {
3208 case 1: /* strict */
3209 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3210 return -1;
3211 case 2: /* replace */
3212 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3213 x = charmapencode_output('?', mapping, res, respos);
3214 if (x==NULL) {
3215 return -1;
3216 }
3217 else if (x==Py_None) {
3218 Py_DECREF(x);
3219 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3220 return -1;
3221 }
3222 Py_DECREF(x);
3223 }
3224 /* fall through */
3225 case 3: /* ignore */
3226 *inpos = collendpos;
3227 break;
3228 case 4: /* xmlcharrefreplace */
3229 /* generate replacement (temporarily (mis)uses p) */
3230 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3231 char buffer[2+29+1+1];
3232 char *cp;
3233 sprintf(buffer, "&#%d;", (int)p[collpos]);
3234 for (cp = buffer; *cp; ++cp) {
3235 x = charmapencode_output(*cp, mapping, res, respos);
3236 if (x==NULL)
3237 return -1;
3238 else if (x==Py_None) {
3239 Py_DECREF(x);
3240 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3241 return -1;
3242 }
3243 Py_DECREF(x);
3244 }
3245 }
3246 *inpos = collendpos;
3247 break;
3248 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003249 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250 encoding, reason, p, size, exceptionObject,
3251 collstartpos, collendpos, &newpos);
3252 if (repunicode == NULL)
3253 return -1;
3254 /* generate replacement */
3255 repsize = PyUnicode_GET_SIZE(repunicode);
3256 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3257 x = charmapencode_output(*uni2, mapping, res, respos);
3258 if (x==NULL) {
3259 Py_DECREF(repunicode);
3260 return -1;
3261 }
3262 else if (x==Py_None) {
3263 Py_DECREF(repunicode);
3264 Py_DECREF(x);
3265 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3266 return -1;
3267 }
3268 Py_DECREF(x);
3269 }
3270 *inpos = newpos;
3271 Py_DECREF(repunicode);
3272 }
3273 return 0;
3274}
3275
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003277 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 PyObject *mapping,
3279 const char *errors)
3280{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003281 /* output object */
3282 PyObject *res = NULL;
3283 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003284 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003286 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003287 PyObject *errorHandler = NULL;
3288 PyObject *exc = NULL;
3289 /* the following variable is used for caching string comparisons
3290 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3291 * 3=ignore, 4=xmlcharrefreplace */
3292 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293
3294 /* Default to Latin-1 */
3295 if (mapping == NULL)
3296 return PyUnicode_EncodeLatin1(p, size, errors);
3297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 /* allocate enough for a simple encoding without
3299 replacements, if we need more, we'll resize */
3300 res = PyString_FromStringAndSize(NULL, size);
3301 if (res == NULL)
3302 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003303 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306 while (inpos<size) {
3307 /* try to encode it */
3308 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3309 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311 if (x==Py_None) { /* unencodable character */
3312 if (charmap_encoding_error(p, size, &inpos, mapping,
3313 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003314 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003315 &res, &respos)) {
3316 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003317 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003320 else
3321 /* done with this character => adjust input position */
3322 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 Py_DECREF(x);
3324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 /* Resize if we allocated to much */
3327 if (respos<PyString_GET_SIZE(res)) {
3328 if (_PyString_Resize(&res, respos))
3329 goto onError;
3330 }
3331 Py_XDECREF(exc);
3332 Py_XDECREF(errorHandler);
3333 return res;
3334
3335 onError:
3336 Py_XDECREF(res);
3337 Py_XDECREF(exc);
3338 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 return NULL;
3340}
3341
3342PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3343 PyObject *mapping)
3344{
3345 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3346 PyErr_BadArgument();
3347 return NULL;
3348 }
3349 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3350 PyUnicode_GET_SIZE(unicode),
3351 mapping,
3352 NULL);
3353}
3354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355/* create or adjust a UnicodeTranslateError */
3356static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003357 const Py_UNICODE *unicode, Py_ssize_t size,
3358 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003361 if (*exceptionObject == NULL) {
3362 *exceptionObject = PyUnicodeTranslateError_Create(
3363 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 }
3365 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3367 goto onError;
3368 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3369 goto onError;
3370 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3371 goto onError;
3372 return;
3373 onError:
3374 Py_DECREF(*exceptionObject);
3375 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 }
3377}
3378
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379/* raises a UnicodeTranslateError */
3380static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003381 const Py_UNICODE *unicode, Py_ssize_t size,
3382 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383 const char *reason)
3384{
3385 make_translate_exception(exceptionObject,
3386 unicode, size, startpos, endpos, reason);
3387 if (*exceptionObject != NULL)
3388 PyCodec_StrictErrors(*exceptionObject);
3389}
3390
3391/* error handling callback helper:
3392 build arguments, call the callback and check the arguments,
3393 put the result into newpos and return the replacement string, which
3394 has to be freed by the caller */
3395static PyObject *unicode_translate_call_errorhandler(const char *errors,
3396 PyObject **errorHandler,
3397 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003398 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3399 Py_ssize_t startpos, Py_ssize_t endpos,
3400 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003402 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403
Martin v. Löwis412fb672006-04-13 06:34:32 +00003404 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003405 PyObject *restuple;
3406 PyObject *resunicode;
3407
3408 if (*errorHandler == NULL) {
3409 *errorHandler = PyCodec_LookupError(errors);
3410 if (*errorHandler == NULL)
3411 return NULL;
3412 }
3413
3414 make_translate_exception(exceptionObject,
3415 unicode, size, startpos, endpos, reason);
3416 if (*exceptionObject == NULL)
3417 return NULL;
3418
3419 restuple = PyObject_CallFunctionObjArgs(
3420 *errorHandler, *exceptionObject, NULL);
3421 if (restuple == NULL)
3422 return NULL;
3423 if (!PyTuple_Check(restuple)) {
3424 PyErr_Format(PyExc_TypeError, &argparse[4]);
3425 Py_DECREF(restuple);
3426 return NULL;
3427 }
3428 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003429 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 Py_DECREF(restuple);
3431 return NULL;
3432 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003433 if (i_newpos<0)
3434 *newpos = size+i_newpos;
3435 else
3436 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003437 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003438 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003439 Py_DECREF(restuple);
3440 return NULL;
3441 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442 Py_INCREF(resunicode);
3443 Py_DECREF(restuple);
3444 return resunicode;
3445}
3446
3447/* Lookup the character ch in the mapping and put the result in result,
3448 which must be decrefed by the caller.
3449 Return 0 on success, -1 on error */
3450static
3451int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3452{
3453 PyObject *w = PyInt_FromLong((long)c);
3454 PyObject *x;
3455
3456 if (w == NULL)
3457 return -1;
3458 x = PyObject_GetItem(mapping, w);
3459 Py_DECREF(w);
3460 if (x == NULL) {
3461 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3462 /* No mapping found means: use 1:1 mapping. */
3463 PyErr_Clear();
3464 *result = NULL;
3465 return 0;
3466 } else
3467 return -1;
3468 }
3469 else if (x == Py_None) {
3470 *result = x;
3471 return 0;
3472 }
3473 else if (PyInt_Check(x)) {
3474 long value = PyInt_AS_LONG(x);
3475 long max = PyUnicode_GetMax();
3476 if (value < 0 || value > max) {
3477 PyErr_Format(PyExc_TypeError,
3478 "character mapping must be in range(0x%lx)", max+1);
3479 Py_DECREF(x);
3480 return -1;
3481 }
3482 *result = x;
3483 return 0;
3484 }
3485 else if (PyUnicode_Check(x)) {
3486 *result = x;
3487 return 0;
3488 }
3489 else {
3490 /* wrong return value */
3491 PyErr_SetString(PyExc_TypeError,
3492 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003493 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 return -1;
3495 }
3496}
3497/* ensure that *outobj is at least requiredsize characters long,
3498if not reallocate and adjust various state variables.
3499Return 0 on success, -1 on error */
3500static
Walter Dörwald4894c302003-10-24 14:25:28 +00003501int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003502 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003504 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003505 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003507 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003509 if (requiredsize < 2 * oldsize)
3510 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003511 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 return -1;
3513 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514 }
3515 return 0;
3516}
3517/* lookup the character, put the result in the output string and adjust
3518 various state variables. Return a new reference to the object that
3519 was put in the output buffer in *result, or Py_None, if the mapping was
3520 undefined (in which case no character was written).
3521 The called must decref result.
3522 Return 0 on success, -1 on error. */
3523static
Walter Dörwald4894c302003-10-24 14:25:28 +00003524int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003525 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003526 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527{
Walter Dörwald4894c302003-10-24 14:25:28 +00003528 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 return -1;
3530 if (*res==NULL) {
3531 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003532 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 }
3534 else if (*res==Py_None)
3535 ;
3536 else if (PyInt_Check(*res)) {
3537 /* no overflow check, because we know that the space is enough */
3538 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3539 }
3540 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003541 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 if (repsize==1) {
3543 /* no overflow check, because we know that the space is enough */
3544 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3545 }
3546 else if (repsize!=0) {
3547 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003548 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003549 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003550 repsize - 1;
3551 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 return -1;
3553 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3554 *outp += repsize;
3555 }
3556 }
3557 else
3558 return -1;
3559 return 0;
3560}
3561
3562PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003563 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 PyObject *mapping,
3565 const char *errors)
3566{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 /* output object */
3568 PyObject *res = NULL;
3569 /* pointers to the beginning and end+1 of input */
3570 const Py_UNICODE *startp = p;
3571 const Py_UNICODE *endp = p + size;
3572 /* pointer into the output */
3573 Py_UNICODE *str;
3574 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 char *reason = "character maps to <undefined>";
3577 PyObject *errorHandler = NULL;
3578 PyObject *exc = NULL;
3579 /* the following variable is used for caching string comparisons
3580 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3581 * 3=ignore, 4=xmlcharrefreplace */
3582 int known_errorHandler = -1;
3583
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 if (mapping == NULL) {
3585 PyErr_BadArgument();
3586 return NULL;
3587 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588
3589 /* allocate enough for a simple 1:1 translation without
3590 replacements, if we need more, we'll resize */
3591 res = PyUnicode_FromUnicode(NULL, size);
3592 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003593 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 return res;
3596 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 while (p<endp) {
3599 /* try to encode it */
3600 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003601 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 goto onError;
3604 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003605 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 if (x!=Py_None) /* it worked => adjust input pointer */
3607 ++p;
3608 else { /* untranslatable character */
3609 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003610 Py_ssize_t repsize;
3611 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 Py_UNICODE *uni2;
3613 /* startpos for collecting untranslatable chars */
3614 const Py_UNICODE *collstart = p;
3615 const Py_UNICODE *collend = p+1;
3616 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 /* find all untranslatable characters */
3619 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003620 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 goto onError;
3622 Py_XDECREF(x);
3623 if (x!=Py_None)
3624 break;
3625 ++collend;
3626 }
3627 /* cache callback name lookup
3628 * (if not done yet, i.e. it's the first error) */
3629 if (known_errorHandler==-1) {
3630 if ((errors==NULL) || (!strcmp(errors, "strict")))
3631 known_errorHandler = 1;
3632 else if (!strcmp(errors, "replace"))
3633 known_errorHandler = 2;
3634 else if (!strcmp(errors, "ignore"))
3635 known_errorHandler = 3;
3636 else if (!strcmp(errors, "xmlcharrefreplace"))
3637 known_errorHandler = 4;
3638 else
3639 known_errorHandler = 0;
3640 }
3641 switch (known_errorHandler) {
3642 case 1: /* strict */
3643 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3644 goto onError;
3645 case 2: /* replace */
3646 /* No need to check for space, this is a 1:1 replacement */
3647 for (coll = collstart; coll<collend; ++coll)
3648 *str++ = '?';
3649 /* fall through */
3650 case 3: /* ignore */
3651 p = collend;
3652 break;
3653 case 4: /* xmlcharrefreplace */
3654 /* generate replacement (temporarily (mis)uses p) */
3655 for (p = collstart; p < collend; ++p) {
3656 char buffer[2+29+1+1];
3657 char *cp;
3658 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003659 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3661 goto onError;
3662 for (cp = buffer; *cp; ++cp)
3663 *str++ = *cp;
3664 }
3665 p = collend;
3666 break;
3667 default:
3668 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3669 reason, startp, size, &exc,
3670 collstart-startp, collend-startp, &newpos);
3671 if (repunicode == NULL)
3672 goto onError;
3673 /* generate replacement */
3674 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003675 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3677 Py_DECREF(repunicode);
3678 goto onError;
3679 }
3680 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3681 *str++ = *uni2;
3682 p = startp + newpos;
3683 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 }
3685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 /* Resize if we allocated to much */
3688 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003689 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003690 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 }
3693 Py_XDECREF(exc);
3694 Py_XDECREF(errorHandler);
3695 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003697 onError:
3698 Py_XDECREF(res);
3699 Py_XDECREF(exc);
3700 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 return NULL;
3702}
3703
3704PyObject *PyUnicode_Translate(PyObject *str,
3705 PyObject *mapping,
3706 const char *errors)
3707{
3708 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003709
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 str = PyUnicode_FromObject(str);
3711 if (str == NULL)
3712 goto onError;
3713 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3714 PyUnicode_GET_SIZE(str),
3715 mapping,
3716 errors);
3717 Py_DECREF(str);
3718 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003719
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 onError:
3721 Py_XDECREF(str);
3722 return NULL;
3723}
Tim Petersced69f82003-09-16 20:30:58 +00003724
Guido van Rossum9e896b32000-04-05 20:11:21 +00003725/* --- Decimal Encoder ---------------------------------------------------- */
3726
3727int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003728 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003729 char *output,
3730 const char *errors)
3731{
3732 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 PyObject *errorHandler = NULL;
3734 PyObject *exc = NULL;
3735 const char *encoding = "decimal";
3736 const char *reason = "invalid decimal Unicode string";
3737 /* the following variable is used for caching string comparisons
3738 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3739 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003740
3741 if (output == NULL) {
3742 PyErr_BadArgument();
3743 return -1;
3744 }
3745
3746 p = s;
3747 end = s + length;
3748 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003750 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003751 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003752 Py_ssize_t repsize;
3753 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 Py_UNICODE *uni2;
3755 Py_UNICODE *collstart;
3756 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003757
Guido van Rossum9e896b32000-04-05 20:11:21 +00003758 if (Py_UNICODE_ISSPACE(ch)) {
3759 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003761 continue;
3762 }
3763 decimal = Py_UNICODE_TODECIMAL(ch);
3764 if (decimal >= 0) {
3765 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003767 continue;
3768 }
Guido van Rossumba477042000-04-06 18:18:10 +00003769 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003770 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003772 continue;
3773 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 /* All other characters are considered unencodable */
3775 collstart = p;
3776 collend = p+1;
3777 while (collend < end) {
3778 if ((0 < *collend && *collend < 256) ||
3779 !Py_UNICODE_ISSPACE(*collend) ||
3780 Py_UNICODE_TODECIMAL(*collend))
3781 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003782 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 /* cache callback name lookup
3784 * (if not done yet, i.e. it's the first error) */
3785 if (known_errorHandler==-1) {
3786 if ((errors==NULL) || (!strcmp(errors, "strict")))
3787 known_errorHandler = 1;
3788 else if (!strcmp(errors, "replace"))
3789 known_errorHandler = 2;
3790 else if (!strcmp(errors, "ignore"))
3791 known_errorHandler = 3;
3792 else if (!strcmp(errors, "xmlcharrefreplace"))
3793 known_errorHandler = 4;
3794 else
3795 known_errorHandler = 0;
3796 }
3797 switch (known_errorHandler) {
3798 case 1: /* strict */
3799 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3800 goto onError;
3801 case 2: /* replace */
3802 for (p = collstart; p < collend; ++p)
3803 *output++ = '?';
3804 /* fall through */
3805 case 3: /* ignore */
3806 p = collend;
3807 break;
3808 case 4: /* xmlcharrefreplace */
3809 /* generate replacement (temporarily (mis)uses p) */
3810 for (p = collstart; p < collend; ++p)
3811 output += sprintf(output, "&#%d;", (int)*p);
3812 p = collend;
3813 break;
3814 default:
3815 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3816 encoding, reason, s, length, &exc,
3817 collstart-s, collend-s, &newpos);
3818 if (repunicode == NULL)
3819 goto onError;
3820 /* generate replacement */
3821 repsize = PyUnicode_GET_SIZE(repunicode);
3822 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3823 Py_UNICODE ch = *uni2;
3824 if (Py_UNICODE_ISSPACE(ch))
3825 *output++ = ' ';
3826 else {
3827 decimal = Py_UNICODE_TODECIMAL(ch);
3828 if (decimal >= 0)
3829 *output++ = '0' + decimal;
3830 else if (0 < ch && ch < 256)
3831 *output++ = (char)ch;
3832 else {
3833 Py_DECREF(repunicode);
3834 raise_encode_exception(&exc, encoding,
3835 s, length, collstart-s, collend-s, reason);
3836 goto onError;
3837 }
3838 }
3839 }
3840 p = s + newpos;
3841 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003842 }
3843 }
3844 /* 0-terminate the output string */
3845 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003846 Py_XDECREF(exc);
3847 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003848 return 0;
3849
3850 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851 Py_XDECREF(exc);
3852 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003853 return -1;
3854}
3855
/* --- Helpers ------------------------------------------------------------ */

/* Element type of the buffers handled here.  NOTE(review): presumably read
   by the stringlib headers included further down -- confirm there. */
#define STRINGLIB_CHAR Py_UNICODE

/* Map the generic STRINGLIB_* accessor/constructor names onto the Unicode
   object API: length, construction from a buffer, and buffer access. */
#define STRINGLIB_LEN PyUnicode_GET_SIZE
#define STRINGLIB_NEW PyUnicode_FromUnicode
#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00003863
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00003864Py_LOCAL(int)
3865STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
3866{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00003867 if (str[0] != other[0])
3868 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00003869 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
3870}
3871
/* Object bound to the generic STRINGLIB_EMPTY name.  NOTE(review):
   presumably the shared empty Unicode object -- confirm at its definition. */
#define STRINGLIB_EMPTY unicode_empty

/* Pull in the stringlib headers after the STRINGLIB_* names are defined. */
#include "stringlib/fastsearch.h"

#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
3879
/* Helper macro: normalise the caller's `start` / `end` slice variables
   against (obj)->length.  Negative values are taken relative to the end of
   the object and then clamped to 0; `end` is additionally capped at the
   length.  `start` is not capped at the length.  Expands to plain
   statements that assign to variables literally named `start` and `end`
   in the calling scope. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
3892
Martin v. Löwis18e16552006-02-15 17:27:45 +00003893Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00003894 PyObject *substr,
3895 Py_ssize_t start,
3896 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003898 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00003899 PyUnicodeObject* str_obj;
3900 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00003901
Fredrik Lundh58b5e842006-05-26 19:24:53 +00003902 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
3903 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00003905 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
3906 if (!sub_obj) {
3907 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 return -1;
3909 }
Tim Petersced69f82003-09-16 20:30:58 +00003910
Fredrik Lundhc8162812006-05-26 19:33:03 +00003911 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00003912
Fredrik Lundh58b5e842006-05-26 19:24:53 +00003913 result = stringlib_count(
3914 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
3915 );
3916
3917 Py_DECREF(sub_obj);
3918 Py_DECREF(str_obj);
3919
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 return result;
3921}
3922
Martin v. Löwis18e16552006-02-15 17:27:45 +00003923Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00003924 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00003925 Py_ssize_t start,
3926 Py_ssize_t end,
3927 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003929 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003930
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00003931 str = PyUnicode_FromObject(str);
3932 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003933 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00003934 sub = PyUnicode_FromObject(sub);
3935 if (!sub) {
3936 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003937 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 }
Tim Petersced69f82003-09-16 20:30:58 +00003939
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00003940 FIX_START_END((PyUnicodeObject*) str);
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00003941
3942 if (direction > 0)
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00003943 result = stringlib_find_obj(str, sub, start, end);
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00003944 else
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00003945 result = stringlib_rfind_obj(str, sub, start, end);
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00003946
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00003947 Py_DECREF(str);
3948 Py_DECREF(sub);
3949
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 return result;
3951}
3952
Tim Petersced69f82003-09-16 20:30:58 +00003953static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954int tailmatch(PyUnicodeObject *self,
3955 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003956 Py_ssize_t start,
3957 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 int direction)
3959{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 if (substring->length == 0)
3961 return 1;
3962
Fredrik Lundhc8162812006-05-26 19:33:03 +00003963 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964
3965 end -= substring->length;
3966 if (end < start)
3967 return 0;
3968
3969 if (direction > 0) {
3970 if (Py_UNICODE_MATCH(self, end, substring))
3971 return 1;
3972 } else {
3973 if (Py_UNICODE_MATCH(self, start, substring))
3974 return 1;
3975 }
3976
3977 return 0;
3978}
3979
Martin v. Löwis18e16552006-02-15 17:27:45 +00003980Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003982 Py_ssize_t start,
3983 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 int direction)
3985{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003986 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003987
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 str = PyUnicode_FromObject(str);
3989 if (str == NULL)
3990 return -1;
3991 substr = PyUnicode_FromObject(substr);
3992 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003993 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 return -1;
3995 }
Tim Petersced69f82003-09-16 20:30:58 +00003996
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 result = tailmatch((PyUnicodeObject *)str,
3998 (PyUnicodeObject *)substr,
3999 start, end, direction);
4000 Py_DECREF(str);
4001 Py_DECREF(substr);
4002 return result;
4003}
4004
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005/* Apply fixfct filter to the Unicode object self and return a
4006 reference to the modified object */
4007
Tim Petersced69f82003-09-16 20:30:58 +00004008static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009PyObject *fixup(PyUnicodeObject *self,
4010 int (*fixfct)(PyUnicodeObject *s))
4011{
4012
4013 PyUnicodeObject *u;
4014
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004015 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 if (u == NULL)
4017 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004018
4019 Py_UNICODE_COPY(u->str, self->str, self->length);
4020
Tim Peters7a29bd52001-09-12 03:03:31 +00004021 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 /* fixfct should return TRUE if it modified the buffer. If
4023 FALSE, return a reference to the original buffer instead
4024 (to save space, not time) */
4025 Py_INCREF(self);
4026 Py_DECREF(u);
4027 return (PyObject*) self;
4028 }
4029 return (PyObject*) u;
4030}
4031
Tim Petersced69f82003-09-16 20:30:58 +00004032static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033int fixupper(PyUnicodeObject *self)
4034{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004035 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 Py_UNICODE *s = self->str;
4037 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004038
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 while (len-- > 0) {
4040 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004041
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 ch = Py_UNICODE_TOUPPER(*s);
4043 if (ch != *s) {
4044 status = 1;
4045 *s = ch;
4046 }
4047 s++;
4048 }
4049
4050 return status;
4051}
4052
Tim Petersced69f82003-09-16 20:30:58 +00004053static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054int fixlower(PyUnicodeObject *self)
4055{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004056 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 Py_UNICODE *s = self->str;
4058 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004059
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 while (len-- > 0) {
4061 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004062
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 ch = Py_UNICODE_TOLOWER(*s);
4064 if (ch != *s) {
4065 status = 1;
4066 *s = ch;
4067 }
4068 s++;
4069 }
4070
4071 return status;
4072}
4073
Tim Petersced69f82003-09-16 20:30:58 +00004074static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075int fixswapcase(PyUnicodeObject *self)
4076{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004077 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 Py_UNICODE *s = self->str;
4079 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004080
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081 while (len-- > 0) {
4082 if (Py_UNICODE_ISUPPER(*s)) {
4083 *s = Py_UNICODE_TOLOWER(*s);
4084 status = 1;
4085 } else if (Py_UNICODE_ISLOWER(*s)) {
4086 *s = Py_UNICODE_TOUPPER(*s);
4087 status = 1;
4088 }
4089 s++;
4090 }
4091
4092 return status;
4093}
4094
Tim Petersced69f82003-09-16 20:30:58 +00004095static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096int fixcapitalize(PyUnicodeObject *self)
4097{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004098 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004099 Py_UNICODE *s = self->str;
4100 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004101
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004102 if (len == 0)
4103 return 0;
4104 if (Py_UNICODE_ISLOWER(*s)) {
4105 *s = Py_UNICODE_TOUPPER(*s);
4106 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004108 s++;
4109 while (--len > 0) {
4110 if (Py_UNICODE_ISUPPER(*s)) {
4111 *s = Py_UNICODE_TOLOWER(*s);
4112 status = 1;
4113 }
4114 s++;
4115 }
4116 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117}
4118
4119static
4120int fixtitle(PyUnicodeObject *self)
4121{
4122 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4123 register Py_UNICODE *e;
4124 int previous_is_cased;
4125
4126 /* Shortcut for single character strings */
4127 if (PyUnicode_GET_SIZE(self) == 1) {
4128 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4129 if (*p != ch) {
4130 *p = ch;
4131 return 1;
4132 }
4133 else
4134 return 0;
4135 }
Tim Petersced69f82003-09-16 20:30:58 +00004136
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 e = p + PyUnicode_GET_SIZE(self);
4138 previous_is_cased = 0;
4139 for (; p < e; p++) {
4140 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004141
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 if (previous_is_cased)
4143 *p = Py_UNICODE_TOLOWER(ch);
4144 else
4145 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004146
4147 if (Py_UNICODE_ISLOWER(ch) ||
4148 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149 Py_UNICODE_ISTITLE(ch))
4150 previous_is_cased = 1;
4151 else
4152 previous_is_cased = 0;
4153 }
4154 return 1;
4155}
4156
Tim Peters8ce9f162004-08-27 01:49:32 +00004157PyObject *
4158PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159{
Tim Peters8ce9f162004-08-27 01:49:32 +00004160 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004161 const Py_UNICODE blank = ' ';
4162 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004163 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004164 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004165 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4166 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004167 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4168 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004169 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004170 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004171 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172
Tim Peters05eba1f2004-08-27 21:32:02 +00004173 fseq = PySequence_Fast(seq, "");
4174 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004175 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004176 }
4177
Tim Peters91879ab2004-08-27 22:35:44 +00004178 /* Grrrr. A codec may be invoked to convert str objects to
4179 * Unicode, and so it's possible to call back into Python code
4180 * during PyUnicode_FromObject(), and so it's possible for a sick
4181 * codec to change the size of fseq (if seq is a list). Therefore
4182 * we have to keep refetching the size -- can't assume seqlen
4183 * is invariant.
4184 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004185 seqlen = PySequence_Fast_GET_SIZE(fseq);
4186 /* If empty sequence, return u"". */
4187 if (seqlen == 0) {
4188 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4189 goto Done;
4190 }
4191 /* If singleton sequence with an exact Unicode, return that. */
4192 if (seqlen == 1) {
4193 item = PySequence_Fast_GET_ITEM(fseq, 0);
4194 if (PyUnicode_CheckExact(item)) {
4195 Py_INCREF(item);
4196 res = (PyUnicodeObject *)item;
4197 goto Done;
4198 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004199 }
4200
Tim Peters05eba1f2004-08-27 21:32:02 +00004201 /* At least two items to join, or one that isn't exact Unicode. */
4202 if (seqlen > 1) {
4203 /* Set up sep and seplen -- they're needed. */
4204 if (separator == NULL) {
4205 sep = &blank;
4206 seplen = 1;
4207 }
4208 else {
4209 internal_separator = PyUnicode_FromObject(separator);
4210 if (internal_separator == NULL)
4211 goto onError;
4212 sep = PyUnicode_AS_UNICODE(internal_separator);
4213 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004214 /* In case PyUnicode_FromObject() mutated seq. */
4215 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004216 }
4217 }
4218
4219 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004220 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004221 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004222 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004223 res_p = PyUnicode_AS_UNICODE(res);
4224 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004225
Tim Peters05eba1f2004-08-27 21:32:02 +00004226 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004227 Py_ssize_t itemlen;
4228 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004229
4230 item = PySequence_Fast_GET_ITEM(fseq, i);
4231 /* Convert item to Unicode. */
4232 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4233 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004234 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004235 " %.80s found",
4236 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004237 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004238 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004239 item = PyUnicode_FromObject(item);
4240 if (item == NULL)
4241 goto onError;
4242 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004243
Tim Peters91879ab2004-08-27 22:35:44 +00004244 /* In case PyUnicode_FromObject() mutated seq. */
4245 seqlen = PySequence_Fast_GET_SIZE(fseq);
4246
Tim Peters8ce9f162004-08-27 01:49:32 +00004247 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004249 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004250 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004251 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004252 if (i < seqlen - 1) {
4253 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004254 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004255 goto Overflow;
4256 }
4257 if (new_res_used > res_alloc) {
4258 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004259 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004260 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004261 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004262 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004263 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004264 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004265 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004267 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004268 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004270
4271 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004272 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004273 res_p += itemlen;
4274 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004275 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004276 res_p += seplen;
4277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004279 res_used = new_res_used;
4280 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004281
Tim Peters05eba1f2004-08-27 21:32:02 +00004282 /* Shrink res to match the used area; this probably can't fail,
4283 * but it's cheap to check.
4284 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004285 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004286 goto onError;
4287
4288 Done:
4289 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004290 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 return (PyObject *)res;
4292
Tim Peters8ce9f162004-08-27 01:49:32 +00004293 Overflow:
4294 PyErr_SetString(PyExc_OverflowError,
4295 "join() is too long for a Python string");
4296 Py_DECREF(item);
4297 /* fall through */
4298
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004300 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004301 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004302 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 return NULL;
4304}
4305
Tim Petersced69f82003-09-16 20:30:58 +00004306static
4307PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004308 Py_ssize_t left,
4309 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 Py_UNICODE fill)
4311{
4312 PyUnicodeObject *u;
4313
4314 if (left < 0)
4315 left = 0;
4316 if (right < 0)
4317 right = 0;
4318
Tim Peters7a29bd52001-09-12 03:03:31 +00004319 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 Py_INCREF(self);
4321 return self;
4322 }
4323
4324 u = _PyUnicode_New(left + self->length + right);
4325 if (u) {
4326 if (left)
4327 Py_UNICODE_FILL(u->str, fill, left);
4328 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4329 if (right)
4330 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4331 }
4332
4333 return u;
4334}
4335
/* Append data[left:right] to `list` as a new Unicode object.

   Relies on the expanding function providing: a `PyObject *str` scratch
   variable, a `PyObject *list` to append to, and an `onError` label that
   is jumped to when creating the object or appending it fails.  The
   temporary reference held in `str` is always released.  Expands to
   several statements, so only use it inside a braced block. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
4346
4347static
4348PyObject *split_whitespace(PyUnicodeObject *self,
4349 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004350 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004352 register Py_ssize_t i;
4353 register Py_ssize_t j;
4354 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 PyObject *str;
4356
4357 for (i = j = 0; i < len; ) {
4358 /* find a token */
4359 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4360 i++;
4361 j = i;
4362 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4363 i++;
4364 if (j < i) {
4365 if (maxcount-- <= 0)
4366 break;
4367 SPLIT_APPEND(self->str, j, i);
4368 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4369 i++;
4370 j = i;
4371 }
4372 }
4373 if (j < len) {
4374 SPLIT_APPEND(self->str, j, len);
4375 }
4376 return list;
4377
4378 onError:
4379 Py_DECREF(list);
4380 return NULL;
4381}
4382
4383PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004384 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004386 register Py_ssize_t i;
4387 register Py_ssize_t j;
4388 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389 PyObject *list;
4390 PyObject *str;
4391 Py_UNICODE *data;
4392
4393 string = PyUnicode_FromObject(string);
4394 if (string == NULL)
4395 return NULL;
4396 data = PyUnicode_AS_UNICODE(string);
4397 len = PyUnicode_GET_SIZE(string);
4398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 list = PyList_New(0);
4400 if (!list)
4401 goto onError;
4402
4403 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004404 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004405
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004407 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409
4410 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004411 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 if (i < len) {
4413 if (data[i] == '\r' && i + 1 < len &&
4414 data[i+1] == '\n')
4415 i += 2;
4416 else
4417 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004418 if (keepends)
4419 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 }
Guido van Rossum86662912000-04-11 15:38:46 +00004421 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 j = i;
4423 }
4424 if (j < len) {
4425 SPLIT_APPEND(data, j, len);
4426 }
4427
4428 Py_DECREF(string);
4429 return list;
4430
4431 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004432 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 Py_DECREF(string);
4434 return NULL;
4435}
4436
Tim Petersced69f82003-09-16 20:30:58 +00004437static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438PyObject *split_char(PyUnicodeObject *self,
4439 PyObject *list,
4440 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004441 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004443 register Py_ssize_t i;
4444 register Py_ssize_t j;
4445 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 PyObject *str;
4447
4448 for (i = j = 0; i < len; ) {
4449 if (self->str[i] == ch) {
4450 if (maxcount-- <= 0)
4451 break;
4452 SPLIT_APPEND(self->str, j, i);
4453 i = j = i + 1;
4454 } else
4455 i++;
4456 }
4457 if (j <= len) {
4458 SPLIT_APPEND(self->str, j, len);
4459 }
4460 return list;
4461
4462 onError:
4463 Py_DECREF(list);
4464 return NULL;
4465}
4466
Tim Petersced69f82003-09-16 20:30:58 +00004467static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468PyObject *split_substring(PyUnicodeObject *self,
4469 PyObject *list,
4470 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004471 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004473 register Py_ssize_t i;
4474 register Py_ssize_t j;
4475 Py_ssize_t len = self->length;
4476 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 PyObject *str;
4478
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004479 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 if (Py_UNICODE_MATCH(self, i, substring)) {
4481 if (maxcount-- <= 0)
4482 break;
4483 SPLIT_APPEND(self->str, j, i);
4484 i = j = i + sublen;
4485 } else
4486 i++;
4487 }
4488 if (j <= len) {
4489 SPLIT_APPEND(self->str, j, len);
4490 }
4491 return list;
4492
4493 onError:
4494 Py_DECREF(list);
4495 return NULL;
4496}
4497
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004498static
4499PyObject *rsplit_whitespace(PyUnicodeObject *self,
4500 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004501 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004503 register Py_ssize_t i;
4504 register Py_ssize_t j;
4505 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004506 PyObject *str;
4507
4508 for (i = j = len - 1; i >= 0; ) {
4509 /* find a token */
4510 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4511 i--;
4512 j = i;
4513 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4514 i--;
4515 if (j > i) {
4516 if (maxcount-- <= 0)
4517 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004518 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004519 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4520 i--;
4521 j = i;
4522 }
4523 }
4524 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004525 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004526 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004527 if (PyList_Reverse(list) < 0)
4528 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004529 return list;
4530
4531 onError:
4532 Py_DECREF(list);
4533 return NULL;
4534}
4535
4536static
4537PyObject *rsplit_char(PyUnicodeObject *self,
4538 PyObject *list,
4539 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004540 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004541{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004542 register Py_ssize_t i;
4543 register Py_ssize_t j;
4544 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004545 PyObject *str;
4546
4547 for (i = j = len - 1; i >= 0; ) {
4548 if (self->str[i] == ch) {
4549 if (maxcount-- <= 0)
4550 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004551 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004552 j = i = i - 1;
4553 } else
4554 i--;
4555 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004556 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004557 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004558 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004559 if (PyList_Reverse(list) < 0)
4560 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004561 return list;
4562
4563 onError:
4564 Py_DECREF(list);
4565 return NULL;
4566}
4567
4568static
4569PyObject *rsplit_substring(PyUnicodeObject *self,
4570 PyObject *list,
4571 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004572 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004573{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004574 register Py_ssize_t i;
4575 register Py_ssize_t j;
4576 Py_ssize_t len = self->length;
4577 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004578 PyObject *str;
4579
4580 for (i = len - sublen, j = len; i >= 0; ) {
4581 if (Py_UNICODE_MATCH(self, i, substring)) {
4582 if (maxcount-- <= 0)
4583 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004584 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004585 j = i;
4586 i -= sublen;
4587 } else
4588 i--;
4589 }
4590 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004591 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004592 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004593 if (PyList_Reverse(list) < 0)
4594 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004595 return list;
4596
4597 onError:
4598 Py_DECREF(list);
4599 return NULL;
4600}
4601
Guido van Rossumd57fd912000-03-10 22:53:23 +00004602#undef SPLIT_APPEND
4603
4604static
4605PyObject *split(PyUnicodeObject *self,
4606 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004607 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608{
4609 PyObject *list;
4610
4611 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004612 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613
4614 list = PyList_New(0);
4615 if (!list)
4616 return NULL;
4617
4618 if (substring == NULL)
4619 return split_whitespace(self,list,maxcount);
4620
4621 else if (substring->length == 1)
4622 return split_char(self,list,substring->str[0],maxcount);
4623
4624 else if (substring->length == 0) {
4625 Py_DECREF(list);
4626 PyErr_SetString(PyExc_ValueError, "empty separator");
4627 return NULL;
4628 }
4629 else
4630 return split_substring(self,list,substring,maxcount);
4631}
4632
Tim Petersced69f82003-09-16 20:30:58 +00004633static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004634PyObject *rsplit(PyUnicodeObject *self,
4635 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004636 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004637{
4638 PyObject *list;
4639
4640 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004641 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004642
4643 list = PyList_New(0);
4644 if (!list)
4645 return NULL;
4646
4647 if (substring == NULL)
4648 return rsplit_whitespace(self,list,maxcount);
4649
4650 else if (substring->length == 1)
4651 return rsplit_char(self,list,substring->str[0],maxcount);
4652
4653 else if (substring->length == 0) {
4654 Py_DECREF(list);
4655 PyErr_SetString(PyExc_ValueError, "empty separator");
4656 return NULL;
4657 }
4658 else
4659 return rsplit_substring(self,list,substring,maxcount);
4660}
4661
4662static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663PyObject *replace(PyUnicodeObject *self,
4664 PyUnicodeObject *str1,
4665 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004666 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667{
4668 PyUnicodeObject *u;
4669
4670 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004671 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672
Fredrik Lundh347ee272006-05-24 16:35:18 +00004673 if (str1->length == str2->length) {
4674 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004675 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004676 if (str1->length == 1) {
4677 /* replace characters */
4678 Py_UNICODE u1, u2;
4679 if (!findchar(self->str, self->length, str1->str[0]))
4680 goto nothing;
4681 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4682 if (!u)
4683 return NULL;
4684 Py_UNICODE_COPY(u->str, self->str, self->length);
4685 u1 = str1->str[0];
4686 u2 = str2->str[0];
4687 for (i = 0; i < u->length; i++)
4688 if (u->str[i] == u1) {
4689 if (--maxcount < 0)
4690 break;
4691 u->str[i] = u2;
4692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004694 i = fastsearch(
4695 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00004697 if (i < 0)
4698 goto nothing;
4699 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4700 if (!u)
4701 return NULL;
4702 Py_UNICODE_COPY(u->str, self->str, self->length);
4703 while (i <= self->length - str1->length)
4704 if (Py_UNICODE_MATCH(self, i, str1)) {
4705 if (--maxcount < 0)
4706 break;
4707 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
4708 i += str1->length;
4709 } else
4710 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00004713
Martin v. Löwis18e16552006-02-15 17:27:45 +00004714 Py_ssize_t n, i;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004715 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716 Py_UNICODE *p;
4717
4718 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004719 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 if (n > maxcount)
4721 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004722 if (n == 0)
4723 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00004724 /* new_size = self->length + n * (str2->length - str1->length)); */
4725 delta = (str2->length - str1->length);
4726 if (delta == 0) {
4727 new_size = self->length;
4728 } else {
4729 product = n * (str2->length - str1->length);
4730 if ((product / (str2->length - str1->length)) != n) {
4731 PyErr_SetString(PyExc_OverflowError,
4732 "replace string is too long");
4733 return NULL;
4734 }
4735 new_size = self->length + product;
4736 if (new_size < 0) {
4737 PyErr_SetString(PyExc_OverflowError,
4738 "replace string is too long");
4739 return NULL;
4740 }
4741 }
4742 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00004743 if (!u)
4744 return NULL;
4745 i = 0;
4746 p = u->str;
4747 if (str1->length > 0) {
4748 while (i <= self->length - str1->length)
4749 if (Py_UNICODE_MATCH(self, i, str1)) {
4750 /* replace string segment */
4751 Py_UNICODE_COPY(p, str2->str, str2->length);
4752 p += str2->length;
4753 i += str1->length;
4754 if (--n <= 0) {
4755 /* copy remaining part */
4756 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4757 break;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004758 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004759 } else
4760 *p++ = self->str[i++];
4761 } else {
4762 while (n > 0) {
4763 Py_UNICODE_COPY(p, str2->str, str2->length);
4764 p += str2->length;
4765 if (--n <= 0)
4766 break;
4767 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00004769 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 }
4771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00004773
4774nothing:
4775 /* nothing to replace; return original string (when possible) */
4776 if (PyUnicode_CheckExact(self)) {
4777 Py_INCREF(self);
4778 return (PyObject *) self;
4779 }
4780 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781}
4782
4783/* --- Unicode Object Methods --------------------------------------------- */
4784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004785PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786"S.title() -> unicode\n\
4787\n\
4788Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004789characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790
4791static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004792unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 return fixup(self, fixtitle);
4795}
4796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004797PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798"S.capitalize() -> unicode\n\
4799\n\
4800Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004801have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802
4803static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004804unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 return fixup(self, fixcapitalize);
4807}
4808
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, replaces every list item with its
   fixcapitalize'd version and joins the items again.  Returns NULL if
   splitting or capitalizing fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (list == NULL)
        return NULL;

    /* Capitalize each word */
    i = 0;
    while (i < PyList_GET_SIZE(list)) {
        PyObject *word = PyList_GET_ITEM(list, i);

        item = fixup((PyUnicodeObject *)word, fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the old item before overwriting its slot */
        Py_DECREF(word);
        PyList_SET_ITEM(list, i, item);
        i++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

onError:
    /* reached on success too; item is NULL when fixup() failed */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
4846
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004847/* Argument converter. Coerces to a single unicode character */
4848
4849static int
4850convert_uc(PyObject *obj, void *addr)
4851{
4852 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4853 PyObject *uniobj;
4854 Py_UNICODE *unistr;
4855
4856 uniobj = PyUnicode_FromObject(obj);
4857 if (uniobj == NULL) {
4858 PyErr_SetString(PyExc_TypeError,
4859 "The fill character cannot be converted to Unicode");
4860 return 0;
4861 }
4862 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4863 PyErr_SetString(PyExc_TypeError,
4864 "The fill character must be exactly one character long");
4865 Py_DECREF(uniobj);
4866 return 0;
4867 }
4868 unistr = PyUnicode_AS_UNICODE(uniobj);
4869 *fillcharloc = unistr[0];
4870 Py_DECREF(uniobj);
4871 return 1;
4872}
4873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004874PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004875"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004877Return S centered in a Unicode string of length width. Padding is\n\
4878done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
4880static PyObject *
4881unicode_center(PyUnicodeObject *self, PyObject *args)
4882{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004883 Py_ssize_t marg, left;
4884 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004885 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886
Thomas Woutersde017742006-02-16 19:34:37 +00004887 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 return NULL;
4889
Tim Peters7a29bd52001-09-12 03:03:31 +00004890 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 Py_INCREF(self);
4892 return (PyObject*) self;
4893 }
4894
4895 marg = width - self->length;
4896 left = marg / 2 + (marg & width & 1);
4897
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004898 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899}
4900
Marc-André Lemburge5034372000-08-08 08:04:29 +00004901#if 0
4902
4903/* This code should go into some future Unicode collation support
4904 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004905 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004906
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004907/* speedy UTF-16 code point order comparison */
4908/* gleaned from: */
4909/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4910
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004911static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004912{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004913 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004914 0, 0, 0, 0, 0, 0, 0, 0,
4915 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004916 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004917};
4918
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919static int
4920unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4921{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004922 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004923
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 Py_UNICODE *s1 = str1->str;
4925 Py_UNICODE *s2 = str2->str;
4926
4927 len1 = str1->length;
4928 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004929
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004931 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004932
4933 c1 = *s1++;
4934 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004935
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004936 if (c1 > (1<<11) * 26)
4937 c1 += utf16Fixup[c1>>11];
4938 if (c2 > (1<<11) * 26)
4939 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004940 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004941
4942 if (c1 != c2)
4943 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004944
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004945 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946 }
4947
4948 return (len1 < len2) ? -1 : (len1 != len2);
4949}
4950
Marc-André Lemburge5034372000-08-08 08:04:29 +00004951#else
4952
4953static int
4954unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4955{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004956 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004957
4958 Py_UNICODE *s1 = str1->str;
4959 Py_UNICODE *s2 = str2->str;
4960
4961 len1 = str1->length;
4962 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004963
Marc-André Lemburge5034372000-08-08 08:04:29 +00004964 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004965 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004966
Fredrik Lundh45714e92001-06-26 16:39:36 +00004967 c1 = *s1++;
4968 c2 = *s2++;
4969
4970 if (c1 != c2)
4971 return (c1 < c2) ? -1 : 1;
4972
Marc-André Lemburge5034372000-08-08 08:04:29 +00004973 len1--; len2--;
4974 }
4975
4976 return (len1 < len2) ? -1 : (len1 != len2);
4977}
4978
4979#endif
4980
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981int PyUnicode_Compare(PyObject *left,
4982 PyObject *right)
4983{
4984 PyUnicodeObject *u = NULL, *v = NULL;
4985 int result;
4986
4987 /* Coerce the two arguments */
4988 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4989 if (u == NULL)
4990 goto onError;
4991 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4992 if (v == NULL)
4993 goto onError;
4994
Thomas Wouters7e474022000-07-16 12:04:32 +00004995 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 if (v == u) {
4997 Py_DECREF(u);
4998 Py_DECREF(v);
4999 return 0;
5000 }
5001
5002 result = unicode_compare(u, v);
5003
5004 Py_DECREF(u);
5005 Py_DECREF(v);
5006 return result;
5007
5008onError:
5009 Py_XDECREF(u);
5010 Py_XDECREF(v);
5011 return -1;
5012}
5013
Guido van Rossum403d68b2000-03-13 15:55:09 +00005014int PyUnicode_Contains(PyObject *container,
5015 PyObject *element)
5016{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005017 PyObject *str, *sub;
5018 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005019
5020 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005021 sub = PyUnicode_FromObject(element);
5022 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005023 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005024 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005025 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005026 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005027
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005028 str = PyUnicode_FromObject(container);
5029 if (!str) {
5030 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005031 return -1;
5032 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005033
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005034 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005035
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005036 Py_DECREF(str);
5037 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005038
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005039 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005040}
5041
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042/* Concat to string or Unicode object giving a new Unicode object. */
5043
5044PyObject *PyUnicode_Concat(PyObject *left,
5045 PyObject *right)
5046{
5047 PyUnicodeObject *u = NULL, *v = NULL, *w;
5048
5049 /* Coerce the two arguments */
5050 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5051 if (u == NULL)
5052 goto onError;
5053 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5054 if (v == NULL)
5055 goto onError;
5056
5057 /* Shortcuts */
5058 if (v == unicode_empty) {
5059 Py_DECREF(v);
5060 return (PyObject *)u;
5061 }
5062 if (u == unicode_empty) {
5063 Py_DECREF(u);
5064 return (PyObject *)v;
5065 }
5066
5067 /* Concat the two Unicode strings */
5068 w = _PyUnicode_New(u->length + v->length);
5069 if (w == NULL)
5070 goto onError;
5071 Py_UNICODE_COPY(w->str, u->str, u->length);
5072 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5073
5074 Py_DECREF(u);
5075 Py_DECREF(v);
5076 return (PyObject *)w;
5077
5078onError:
5079 Py_XDECREF(u);
5080 Py_XDECREF(v);
5081 return NULL;
5082}
5083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005084PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085"S.count(sub[, start[, end]]) -> int\n\
5086\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005087Return the number of non-overlapping occurrences of substring sub in\n\
5088Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005089interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090
5091static PyObject *
5092unicode_count(PyUnicodeObject *self, PyObject *args)
5093{
5094 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005095 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005096 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 PyObject *result;
5098
Guido van Rossumb8872e62000-05-09 14:14:27 +00005099 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5100 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 return NULL;
5102
5103 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005104 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 if (substring == NULL)
5106 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005107
Fredrik Lundhc8162812006-05-26 19:33:03 +00005108 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005110 result = PyInt_FromSsize_t(
5111 stringlib_count(self->str + start, end - start,
5112 substring->str, substring->length)
5113 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114
5115 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005116
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 return result;
5118}
5119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005120PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005121"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005123Encodes S using the codec registered for encoding. encoding defaults\n\
5124to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005125handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005126a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5127'xmlcharrefreplace' as well as any other name registered with\n\
5128codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129
5130static PyObject *
5131unicode_encode(PyUnicodeObject *self, PyObject *args)
5132{
5133 char *encoding = NULL;
5134 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005135 PyObject *v;
5136
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5138 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005139 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005140 if (v == NULL)
5141 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005142 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5143 PyErr_Format(PyExc_TypeError,
5144 "encoder did not return a string/unicode object "
5145 "(type=%.400s)",
5146 v->ob_type->tp_name);
5147 Py_DECREF(v);
5148 return NULL;
5149 }
5150 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005151
5152 onError:
5153 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005154}
5155
5156PyDoc_STRVAR(decode__doc__,
5157"S.decode([encoding[,errors]]) -> string or unicode\n\
5158\n\
5159Decodes S using the codec registered for encoding. encoding defaults\n\
5160to the default encoding. errors may be given to set a different error\n\
5161handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5162a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5163as well as any other name registerd with codecs.register_error that is\n\
5164able to handle UnicodeDecodeErrors.");
5165
5166static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005167unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005168{
5169 char *encoding = NULL;
5170 char *errors = NULL;
5171 PyObject *v;
5172
5173 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5174 return NULL;
5175 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005176 if (v == NULL)
5177 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005178 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5179 PyErr_Format(PyExc_TypeError,
5180 "decoder did not return a string/unicode object "
5181 "(type=%.400s)",
5182 v->ob_type->tp_name);
5183 Py_DECREF(v);
5184 return NULL;
5185 }
5186 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005187
5188 onError:
5189 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190}
5191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005192PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193"S.expandtabs([tabsize]) -> unicode\n\
5194\n\
5195Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005196If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197
5198static PyObject*
5199unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5200{
5201 Py_UNICODE *e;
5202 Py_UNICODE *p;
5203 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005204 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 PyUnicodeObject *u;
5206 int tabsize = 8;
5207
5208 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5209 return NULL;
5210
Thomas Wouters7e474022000-07-16 12:04:32 +00005211 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212 i = j = 0;
5213 e = self->str + self->length;
5214 for (p = self->str; p < e; p++)
5215 if (*p == '\t') {
5216 if (tabsize > 0)
5217 j += tabsize - (j % tabsize);
5218 }
5219 else {
5220 j++;
5221 if (*p == '\n' || *p == '\r') {
5222 i += j;
5223 j = 0;
5224 }
5225 }
5226
5227 /* Second pass: create output string and fill it */
5228 u = _PyUnicode_New(i + j);
5229 if (!u)
5230 return NULL;
5231
5232 j = 0;
5233 q = u->str;
5234
5235 for (p = self->str; p < e; p++)
5236 if (*p == '\t') {
5237 if (tabsize > 0) {
5238 i = tabsize - (j % tabsize);
5239 j += i;
5240 while (i--)
5241 *q++ = ' ';
5242 }
5243 }
5244 else {
5245 j++;
5246 *q++ = *p;
5247 if (*p == '\n' || *p == '\r')
5248 j = 0;
5249 }
5250
5251 return (PyObject*) u;
5252}
5253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005254PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255"S.find(sub [,start [,end]]) -> int\n\
5256\n\
5257Return the lowest index in S where substring sub is found,\n\
5258such that sub is contained within s[start,end]. Optional\n\
5259arguments start and end are interpreted as in slice notation.\n\
5260\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005261Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262
5263static PyObject *
5264unicode_find(PyUnicodeObject *self, PyObject *args)
5265{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005266 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005268 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005269 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270
Guido van Rossumb8872e62000-05-09 14:14:27 +00005271 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5272 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005274
5275 substring = PyUnicode_FromObject(substring);
5276 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 return NULL;
5278
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005279 FIX_START_END(self);
5280
5281 result = stringlib_find_obj((PyObject*) self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282
5283 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005284
5285 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286}
5287
5288static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005289unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290{
5291 if (index < 0 || index >= self->length) {
5292 PyErr_SetString(PyExc_IndexError, "string index out of range");
5293 return NULL;
5294 }
5295
5296 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5297}
5298
5299static long
5300unicode_hash(PyUnicodeObject *self)
5301{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005302 /* Since Unicode objects compare equal to their ASCII string
5303 counterparts, they should use the individual character values
5304 as basis for their hash value. This is needed to assure that
5305 strings and Unicode objects behave in the same way as
5306 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Martin v. Löwis18e16552006-02-15 17:27:45 +00005308 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005309 register Py_UNICODE *p;
5310 register long x;
5311
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 if (self->hash != -1)
5313 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005314 len = PyUnicode_GET_SIZE(self);
5315 p = PyUnicode_AS_UNICODE(self);
5316 x = *p << 7;
5317 while (--len >= 0)
5318 x = (1000003*x) ^ *p++;
5319 x ^= PyUnicode_GET_SIZE(self);
5320 if (x == -1)
5321 x = -2;
5322 self->hash = x;
5323 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324}
5325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005326PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327"S.index(sub [,start [,end]]) -> int\n\
5328\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005329Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330
5331static PyObject *
5332unicode_index(PyUnicodeObject *self, PyObject *args)
5333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005334 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005335 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005336 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005337 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338
Guido van Rossumb8872e62000-05-09 14:14:27 +00005339 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5340 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005342
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005343 substring = PyUnicode_FromObject(substring);
5344 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 return NULL;
5346
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005347 FIX_START_END(self);
5348
5349 result = stringlib_find_obj((PyObject*) self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350
5351 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005352
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 if (result < 0) {
5354 PyErr_SetString(PyExc_ValueError, "substring not found");
5355 return NULL;
5356 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005357
Martin v. Löwis18e16552006-02-15 17:27:45 +00005358 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359}
5360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005361PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005362"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005364Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005365at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366
5367static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005368unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369{
5370 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5371 register const Py_UNICODE *e;
5372 int cased;
5373
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 /* Shortcut for single character strings */
5375 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005376 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005378 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005379 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005380 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005381
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 e = p + PyUnicode_GET_SIZE(self);
5383 cased = 0;
5384 for (; p < e; p++) {
5385 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005386
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005388 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 else if (!cased && Py_UNICODE_ISLOWER(ch))
5390 cased = 1;
5391 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005392 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393}
5394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005395PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005396"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005398Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005399at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400
5401static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005402unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403{
5404 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5405 register const Py_UNICODE *e;
5406 int cased;
5407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 /* Shortcut for single character strings */
5409 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005410 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005412 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005413 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005414 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005415
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416 e = p + PyUnicode_GET_SIZE(self);
5417 cased = 0;
5418 for (; p < e; p++) {
5419 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005420
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005422 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 else if (!cased && Py_UNICODE_ISUPPER(ch))
5424 cased = 1;
5425 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005426 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427}
5428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005429PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005430"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005432Return True if S is a titlecased string and there is at least one\n\
5433character in S, i.e. upper- and titlecase characters may only\n\
5434follow uncased characters and lowercase characters only cased ones.\n\
5435Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436
5437static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005438unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439{
5440 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5441 register const Py_UNICODE *e;
5442 int cased, previous_is_cased;
5443
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 /* Shortcut for single character strings */
5445 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005446 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5447 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005449 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005450 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005451 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 e = p + PyUnicode_GET_SIZE(self);
5454 cased = 0;
5455 previous_is_cased = 0;
5456 for (; p < e; p++) {
5457 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005458
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5460 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005461 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 previous_is_cased = 1;
5463 cased = 1;
5464 }
5465 else if (Py_UNICODE_ISLOWER(ch)) {
5466 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005467 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 previous_is_cased = 1;
5469 cased = 1;
5470 }
5471 else
5472 previous_is_cased = 0;
5473 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005474 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475}
5476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005477PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005478"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005480Return True if all characters in S are whitespace\n\
5481and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482
5483static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005484unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485{
5486 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5487 register const Py_UNICODE *e;
5488
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 /* Shortcut for single character strings */
5490 if (PyUnicode_GET_SIZE(self) == 1 &&
5491 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005492 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005494 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005495 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005496 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005497
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 e = p + PyUnicode_GET_SIZE(self);
5499 for (; p < e; p++) {
5500 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005501 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005503 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504}
5505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005506PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005507"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005508\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005509Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005510and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005511
5512static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005513unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005514{
5515 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5516 register const Py_UNICODE *e;
5517
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005518 /* Shortcut for single character strings */
5519 if (PyUnicode_GET_SIZE(self) == 1 &&
5520 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005521 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005522
5523 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005524 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005525 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005526
5527 e = p + PyUnicode_GET_SIZE(self);
5528 for (; p < e; p++) {
5529 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005530 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005531 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005532 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005533}
5534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005535PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005536"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005537\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005538Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005539and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005540
5541static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005542unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005543{
5544 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5545 register const Py_UNICODE *e;
5546
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005547 /* Shortcut for single character strings */
5548 if (PyUnicode_GET_SIZE(self) == 1 &&
5549 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005550 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005551
5552 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005553 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005554 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005555
5556 e = p + PyUnicode_GET_SIZE(self);
5557 for (; p < e; p++) {
5558 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005559 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005560 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005561 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005562}
5563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005564PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005565"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005567Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005568False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569
5570static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005571unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572{
5573 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5574 register const Py_UNICODE *e;
5575
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 /* Shortcut for single character strings */
5577 if (PyUnicode_GET_SIZE(self) == 1 &&
5578 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005579 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005581 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005582 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005583 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005584
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 e = p + PyUnicode_GET_SIZE(self);
5586 for (; p < e; p++) {
5587 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005588 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005590 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591}
5592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005593PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005594"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005596Return True if all characters in S are digits\n\
5597and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598
5599static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005600unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601{
5602 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5603 register const Py_UNICODE *e;
5604
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 /* Shortcut for single character strings */
5606 if (PyUnicode_GET_SIZE(self) == 1 &&
5607 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005608 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005610 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005611 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005612 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005613
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 e = p + PyUnicode_GET_SIZE(self);
5615 for (; p < e; p++) {
5616 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005617 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005619 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620}
5621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005622PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005623"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005625Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005626False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627
5628static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005629unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630{
5631 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5632 register const Py_UNICODE *e;
5633
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 /* Shortcut for single character strings */
5635 if (PyUnicode_GET_SIZE(self) == 1 &&
5636 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005637 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005639 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005640 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005641 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005642
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 e = p + PyUnicode_GET_SIZE(self);
5644 for (; p < e; p++) {
5645 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005646 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005648 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649}
5650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005651PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652"S.join(sequence) -> unicode\n\
5653\n\
5654Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005655sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
5657static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005658unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005660 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661}
5662
Martin v. Löwis18e16552006-02-15 17:27:45 +00005663static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664unicode_length(PyUnicodeObject *self)
5665{
5666 return self->length;
5667}
5668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005669PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005670"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671\n\
5672Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005673done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674
5675static PyObject *
5676unicode_ljust(PyUnicodeObject *self, PyObject *args)
5677{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005678 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005679 Py_UNICODE fillchar = ' ';
5680
Martin v. Löwis412fb672006-04-13 06:34:32 +00005681 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 return NULL;
5683
Tim Peters7a29bd52001-09-12 03:03:31 +00005684 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 Py_INCREF(self);
5686 return (PyObject*) self;
5687 }
5688
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005689 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690}
5691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005692PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693"S.lower() -> unicode\n\
5694\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005695Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696
5697static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005698unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 return fixup(self, fixlower);
5701}
5702
/* Strip modes; the values double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for a strip mode: skips the leading "|O:" (3 chars)
   of the matching format string. */
#define STRIPNAME(i) (&stripformat[i][3])
5711
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005712/* externally visible for str.strip(unicode) */
5713PyObject *
5714_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5715{
5716 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005717 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005718 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005719 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5720 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005721
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005722 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5723
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005724 i = 0;
5725 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005726 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5727 i++;
5728 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005729 }
5730
5731 j = len;
5732 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005733 do {
5734 j--;
5735 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5736 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005737 }
5738
5739 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005740 Py_INCREF(self);
5741 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005742 }
5743 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005744 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005745}
5746
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
5748static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005749do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005751 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005753
5754 i = 0;
5755 if (striptype != RIGHTSTRIP) {
5756 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5757 i++;
5758 }
5759 }
5760
5761 j = len;
5762 if (striptype != LEFTSTRIP) {
5763 do {
5764 j--;
5765 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5766 j++;
5767 }
5768
5769 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5770 Py_INCREF(self);
5771 return (PyObject*)self;
5772 }
5773 else
5774 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775}
5776
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005777
5778static PyObject *
5779do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5780{
5781 PyObject *sep = NULL;
5782
5783 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5784 return NULL;
5785
5786 if (sep != NULL && sep != Py_None) {
5787 if (PyUnicode_Check(sep))
5788 return _PyUnicode_XStrip(self, striptype, sep);
5789 else if (PyString_Check(sep)) {
5790 PyObject *res;
5791 sep = PyUnicode_FromObject(sep);
5792 if (sep==NULL)
5793 return NULL;
5794 res = _PyUnicode_XStrip(self, striptype, sep);
5795 Py_DECREF(sep);
5796 return res;
5797 }
5798 else {
5799 PyErr_Format(PyExc_TypeError,
5800 "%s arg must be None, unicode or str",
5801 STRIPNAME(striptype));
5802 return NULL;
5803 }
5804 }
5805
5806 return do_strip(self, striptype);
5807}
5808
5809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005810PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005811"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005812\n\
5813Return a copy of the string S with leading and trailing\n\
5814whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005815If chars is given and not None, remove characters in chars instead.\n\
5816If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005817
5818static PyObject *
5819unicode_strip(PyUnicodeObject *self, PyObject *args)
5820{
5821 if (PyTuple_GET_SIZE(args) == 0)
5822 return do_strip(self, BOTHSTRIP); /* Common case */
5823 else
5824 return do_argstrip(self, BOTHSTRIP, args);
5825}
5826
5827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005828PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005829"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005830\n\
5831Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005832If chars is given and not None, remove characters in chars instead.\n\
5833If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005834
5835static PyObject *
5836unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5837{
5838 if (PyTuple_GET_SIZE(args) == 0)
5839 return do_strip(self, LEFTSTRIP); /* Common case */
5840 else
5841 return do_argstrip(self, LEFTSTRIP, args);
5842}
5843
5844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005845PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005846"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005847\n\
5848Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005849If chars is given and not None, remove characters in chars instead.\n\
5850If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005851
5852static PyObject *
5853unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5854{
5855 if (PyTuple_GET_SIZE(args) == 0)
5856 return do_strip(self, RIGHTSTRIP); /* Common case */
5857 else
5858 return do_argstrip(self, RIGHTSTRIP, args);
5859}
5860
5861
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864{
5865 PyUnicodeObject *u;
5866 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005867 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005868 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
5870 if (len < 0)
5871 len = 0;
5872
Tim Peters7a29bd52001-09-12 03:03:31 +00005873 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 /* no repeat, return original string */
5875 Py_INCREF(str);
5876 return (PyObject*) str;
5877 }
Tim Peters8f422462000-09-09 06:13:41 +00005878
5879 /* ensure # of chars needed doesn't overflow int and # of bytes
5880 * needed doesn't overflow size_t
5881 */
5882 nchars = len * str->length;
5883 if (len && nchars / len != str->length) {
5884 PyErr_SetString(PyExc_OverflowError,
5885 "repeated string is too long");
5886 return NULL;
5887 }
5888 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5889 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5890 PyErr_SetString(PyExc_OverflowError,
5891 "repeated string is too long");
5892 return NULL;
5893 }
5894 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 if (!u)
5896 return NULL;
5897
5898 p = u->str;
5899
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005900 if (str->length == 1 && len > 0) {
5901 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005902 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00005903 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005904 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005905 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005906 done = str->length;
5907 }
5908 while (done < nchars) {
5909 int n = (done <= nchars-done) ? done : nchars-done;
5910 Py_UNICODE_COPY(p+done, p, n);
5911 done += n;
5912 }
5913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
5915 return (PyObject*) u;
5916}
5917
5918PyObject *PyUnicode_Replace(PyObject *obj,
5919 PyObject *subobj,
5920 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005921 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922{
5923 PyObject *self;
5924 PyObject *str1;
5925 PyObject *str2;
5926 PyObject *result;
5927
5928 self = PyUnicode_FromObject(obj);
5929 if (self == NULL)
5930 return NULL;
5931 str1 = PyUnicode_FromObject(subobj);
5932 if (str1 == NULL) {
5933 Py_DECREF(self);
5934 return NULL;
5935 }
5936 str2 = PyUnicode_FromObject(replobj);
5937 if (str2 == NULL) {
5938 Py_DECREF(self);
5939 Py_DECREF(str1);
5940 return NULL;
5941 }
Tim Petersced69f82003-09-16 20:30:58 +00005942 result = replace((PyUnicodeObject *)self,
5943 (PyUnicodeObject *)str1,
5944 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 maxcount);
5946 Py_DECREF(self);
5947 Py_DECREF(str1);
5948 Py_DECREF(str2);
5949 return result;
5950}
5951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005952PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953"S.replace (old, new[, maxsplit]) -> unicode\n\
5954\n\
5955Return a copy of S with all occurrences of substring\n\
5956old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005957given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958
5959static PyObject*
5960unicode_replace(PyUnicodeObject *self, PyObject *args)
5961{
5962 PyUnicodeObject *str1;
5963 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005964 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 PyObject *result;
5966
Martin v. Löwis18e16552006-02-15 17:27:45 +00005967 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 return NULL;
5969 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5970 if (str1 == NULL)
5971 return NULL;
5972 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005973 if (str2 == NULL) {
5974 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
5978 result = replace(self, str1, str2, maxcount);
5979
5980 Py_DECREF(str1);
5981 Py_DECREF(str2);
5982 return result;
5983}
5984
5985static
5986PyObject *unicode_repr(PyObject *unicode)
5987{
5988 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5989 PyUnicode_GET_SIZE(unicode),
5990 1);
5991}
5992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005993PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994"S.rfind(sub [,start [,end]]) -> int\n\
5995\n\
5996Return the highest index in S where substring sub is found,\n\
5997such that sub is contained within s[start,end]. Optional\n\
5998arguments start and end are interpreted as in slice notation.\n\
5999\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006000Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001
6002static PyObject *
6003unicode_rfind(PyUnicodeObject *self, PyObject *args)
6004{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006005 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006006 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006007 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006008 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
Guido van Rossumb8872e62000-05-09 14:14:27 +00006010 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6011 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006013 substring = PyUnicode_FromObject(substring);
6014 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 return NULL;
6016
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006017 FIX_START_END(self);
6018
6019 result = stringlib_rfind_obj((PyObject*)self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006022
6023 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024}
6025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006026PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027"S.rindex(sub [,start [,end]]) -> int\n\
6028\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006029Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
6031static PyObject *
6032unicode_rindex(PyUnicodeObject *self, PyObject *args)
6033{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006034 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006035 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006036 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006037 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
Guido van Rossumb8872e62000-05-09 14:14:27 +00006039 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6040 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006042 substring = PyUnicode_FromObject(substring);
6043 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 return NULL;
6045
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006046 FIX_START_END(self);
6047
6048 result = stringlib_rfind_obj((PyObject*)self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049
6050 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006051
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 if (result < 0) {
6053 PyErr_SetString(PyExc_ValueError, "substring not found");
6054 return NULL;
6055 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006056 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057}
6058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006059PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006060"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061\n\
6062Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006063done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
6065static PyObject *
6066unicode_rjust(PyUnicodeObject *self, PyObject *args)
6067{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006068 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006069 Py_UNICODE fillchar = ' ';
6070
Martin v. Löwis412fb672006-04-13 06:34:32 +00006071 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 return NULL;
6073
Tim Peters7a29bd52001-09-12 03:03:31 +00006074 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 Py_INCREF(self);
6076 return (PyObject*) self;
6077 }
6078
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006079 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080}
6081
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006083unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084{
6085 /* standard clamping */
6086 if (start < 0)
6087 start = 0;
6088 if (end < 0)
6089 end = 0;
6090 if (end > self->length)
6091 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006092 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 /* full slice, return original string */
6094 Py_INCREF(self);
6095 return (PyObject*) self;
6096 }
6097 if (start > end)
6098 start = end;
6099 /* copy slice */
6100 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6101 end - start);
6102}
6103
6104PyObject *PyUnicode_Split(PyObject *s,
6105 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006106 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107{
6108 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006109
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 s = PyUnicode_FromObject(s);
6111 if (s == NULL)
6112 return NULL;
6113 if (sep != NULL) {
6114 sep = PyUnicode_FromObject(sep);
6115 if (sep == NULL) {
6116 Py_DECREF(s);
6117 return NULL;
6118 }
6119 }
6120
6121 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6122
6123 Py_DECREF(s);
6124 Py_XDECREF(sep);
6125 return result;
6126}
6127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006128PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129"S.split([sep [,maxsplit]]) -> list of strings\n\
6130\n\
6131Return a list of the words in S, using sep as the\n\
6132delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006133splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006134any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
6136static PyObject*
6137unicode_split(PyUnicodeObject *self, PyObject *args)
6138{
6139 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006140 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141
Martin v. Löwis18e16552006-02-15 17:27:45 +00006142 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 return NULL;
6144
6145 if (substring == Py_None)
6146 return split(self, NULL, maxcount);
6147 else if (PyUnicode_Check(substring))
6148 return split(self, (PyUnicodeObject *)substring, maxcount);
6149 else
6150 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6151}
6152
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006153PyObject *
6154PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6155{
6156 PyObject* str_obj;
6157 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006158 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006159
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006160 str_obj = PyUnicode_FromObject(str_in);
6161 if (!str_obj)
6162 return NULL;
6163 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006164 if (!sep_obj) {
6165 Py_DECREF(str_obj);
6166 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006167 }
6168
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006169 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006170 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6171 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6172 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006173
Fredrik Lundhb9479482006-05-26 17:22:38 +00006174 Py_DECREF(sep_obj);
6175 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006176
6177 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006178}
6179
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006180
6181PyObject *
6182PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6183{
6184 PyObject* str_obj;
6185 PyObject* sep_obj;
6186 PyObject* out;
6187
6188 str_obj = PyUnicode_FromObject(str_in);
6189 if (!str_obj)
6190 return NULL;
6191 sep_obj = PyUnicode_FromObject(sep_in);
6192 if (!sep_obj) {
6193 Py_DECREF(str_obj);
6194 return NULL;
6195 }
6196
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006197 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006198 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6199 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6200 );
6201
6202 Py_DECREF(sep_obj);
6203 Py_DECREF(str_obj);
6204
6205 return out;
6206}
6207
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006208PyDoc_STRVAR(partition__doc__,
6209"S.partition(sep) -> (head, sep, tail)\n\
6210\n\
6211Searches for the separator sep in S, and returns the part before it,\n\
6212the separator itself, and the part after it. If the separator is not\n\
6213found, returns S and two empty strings.");
6214
6215static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006216unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006217{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006218 return PyUnicode_Partition((PyObject *)self, separator);
6219}
6220
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006221PyDoc_STRVAR(rpartition__doc__,
6222"S.rpartition(sep) -> (head, sep, tail)\n\
6223\n\
6224Searches for the separator sep in S, starting at the end of S, and returns\n\
6225the part before it, the separator itself, and the part after it. If the\n\
6226separator is not found, returns S and two empty strings.");
6227
6228static PyObject*
6229unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6230{
6231 return PyUnicode_RPartition((PyObject *)self, separator);
6232}
6233
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006234PyObject *PyUnicode_RSplit(PyObject *s,
6235 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006236 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006237{
6238 PyObject *result;
6239
6240 s = PyUnicode_FromObject(s);
6241 if (s == NULL)
6242 return NULL;
6243 if (sep != NULL) {
6244 sep = PyUnicode_FromObject(sep);
6245 if (sep == NULL) {
6246 Py_DECREF(s);
6247 return NULL;
6248 }
6249 }
6250
6251 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6252
6253 Py_DECREF(s);
6254 Py_XDECREF(sep);
6255 return result;
6256}
6257
6258PyDoc_STRVAR(rsplit__doc__,
6259"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6260\n\
6261Return a list of the words in S, using sep as the\n\
6262delimiter string, starting at the end of the string and\n\
6263working to the front. If maxsplit is given, at most maxsplit\n\
6264splits are done. If sep is not specified, any whitespace string\n\
6265is a separator.");
6266
6267static PyObject*
6268unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6269{
6270 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006271 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006272
Martin v. Löwis18e16552006-02-15 17:27:45 +00006273 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006274 return NULL;
6275
6276 if (substring == Py_None)
6277 return rsplit(self, NULL, maxcount);
6278 else if (PyUnicode_Check(substring))
6279 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6280 else
6281 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6282}
6283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006284PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006285"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286\n\
6287Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006288Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006289is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290
6291static PyObject*
6292unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6293{
Guido van Rossum86662912000-04-11 15:38:46 +00006294 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295
Guido van Rossum86662912000-04-11 15:38:46 +00006296 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 return NULL;
6298
Guido van Rossum86662912000-04-11 15:38:46 +00006299 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300}
6301
6302static
6303PyObject *unicode_str(PyUnicodeObject *self)
6304{
Fred Drakee4315f52000-05-09 19:53:39 +00006305 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306}
6307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006308PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309"S.swapcase() -> unicode\n\
6310\n\
6311Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006312and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313
6314static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006315unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 return fixup(self, fixswapcase);
6318}
6319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006320PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321"S.translate(table) -> unicode\n\
6322\n\
6323Return a copy of the string S, where all characters have been mapped\n\
6324through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006325Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6326Unmapped characters are left untouched. Characters mapped to None\n\
6327are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
6329static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006330unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331{
Tim Petersced69f82003-09-16 20:30:58 +00006332 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006334 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 "ignore");
6336}
6337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006338PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339"S.upper() -> unicode\n\
6340\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006341Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
6343static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006344unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 return fixup(self, fixupper);
6347}
6348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006349PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350"S.zfill(width) -> unicode\n\
6351\n\
6352Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006353of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354
6355static PyObject *
6356unicode_zfill(PyUnicodeObject *self, PyObject *args)
6357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006358 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 PyUnicodeObject *u;
6360
Martin v. Löwis18e16552006-02-15 17:27:45 +00006361 Py_ssize_t width;
6362 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 return NULL;
6364
6365 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006366 if (PyUnicode_CheckExact(self)) {
6367 Py_INCREF(self);
6368 return (PyObject*) self;
6369 }
6370 else
6371 return PyUnicode_FromUnicode(
6372 PyUnicode_AS_UNICODE(self),
6373 PyUnicode_GET_SIZE(self)
6374 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 }
6376
6377 fill = width - self->length;
6378
6379 u = pad(self, fill, 0, '0');
6380
Walter Dörwald068325e2002-04-15 13:36:47 +00006381 if (u == NULL)
6382 return NULL;
6383
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 if (u->str[fill] == '+' || u->str[fill] == '-') {
6385 /* move sign to beginning of string */
6386 u->str[0] = u->str[fill];
6387 u->str[fill] = '0';
6388 }
6389
6390 return (PyObject*) u;
6391}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392
#if 0
/* Compiled out.  Returns the value of unicode_freelist_size as a Python
   int; the matching method-table entry is disabled as well. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
6400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006401PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006402"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006404Return True if S starts with the specified prefix, False otherwise.\n\
6405With optional start, test S beginning at that position.\n\
6406With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407
6408static PyObject *
6409unicode_startswith(PyUnicodeObject *self,
6410 PyObject *args)
6411{
6412 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006413 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006414 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 PyObject *result;
6416
Guido van Rossumb8872e62000-05-09 14:14:27 +00006417 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6418 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 return NULL;
6420 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6421 (PyObject *)substring);
6422 if (substring == NULL)
6423 return NULL;
6424
Guido van Rossum77f6a652002-04-03 22:41:51 +00006425 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426
6427 Py_DECREF(substring);
6428 return result;
6429}
6430
6431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006432PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006433"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006435Return True if S ends with the specified suffix, False otherwise.\n\
6436With optional start, test S beginning at that position.\n\
6437With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438
6439static PyObject *
6440unicode_endswith(PyUnicodeObject *self,
6441 PyObject *args)
6442{
6443 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006444 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006445 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 PyObject *result;
6447
Guido van Rossumb8872e62000-05-09 14:14:27 +00006448 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6449 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 return NULL;
6451 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6452 (PyObject *)substring);
6453 if (substring == NULL)
6454 return NULL;
6455
Guido van Rossum77f6a652002-04-03 22:41:51 +00006456 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457
6458 Py_DECREF(substring);
6459 return result;
6460}
6461
6462
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006463
6464static PyObject *
6465unicode_getnewargs(PyUnicodeObject *v)
6466{
6467 return Py_BuildValue("(u#)", v->str, v->length);
6468}
6469
6470
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471static PyMethodDef unicode_methods[] = {
6472
6473 /* Order is according to common usage: often used methods should
6474 appear first, since lookup is done sequentially. */
6475
Georg Brandlecdc0a92006-03-30 12:19:07 +00006476 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006477 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6478 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006479 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006480 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6481 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6482 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6483 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6484 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6485 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6486 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00006487 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006488 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6489 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6490 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006491 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006492 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006493/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6494 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6495 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6496 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006497 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006498 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006499 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006500 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006501 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6502 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6503 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6504 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6505 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6506 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6507 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6508 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6509 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6510 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6511 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6512 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6513 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6514 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006515 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006516#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006517 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518#endif
6519
6520#if 0
6521 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006522 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523#endif
6524
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006525 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 {NULL, NULL}
6527};
6528
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006529static PyObject *
6530unicode_mod(PyObject *v, PyObject *w)
6531{
6532 if (!PyUnicode_Check(v)) {
6533 Py_INCREF(Py_NotImplemented);
6534 return Py_NotImplemented;
6535 }
6536 return PyUnicode_Format(v, w);
6537}
6538
6539static PyNumberMethods unicode_as_number = {
6540 0, /*nb_add*/
6541 0, /*nb_subtract*/
6542 0, /*nb_multiply*/
6543 0, /*nb_divide*/
6544 unicode_mod, /*nb_remainder*/
6545};
6546
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006548 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006549 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006550 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6551 (ssizeargfunc) unicode_getitem, /* sq_item */
6552 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 0, /* sq_ass_item */
6554 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006555 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556};
6557
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006558#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6559
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006560static PyObject*
6561unicode_subscript(PyUnicodeObject* self, PyObject* item)
6562{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006563 PyNumberMethods *nb = item->ob_type->tp_as_number;
6564 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6565 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006566 if (i == -1 && PyErr_Occurred())
6567 return NULL;
6568 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006569 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006570 return unicode_getitem(self, i);
6571 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006572 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006573 Py_UNICODE* source_buf;
6574 Py_UNICODE* result_buf;
6575 PyObject* result;
6576
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006577 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006578 &start, &stop, &step, &slicelength) < 0) {
6579 return NULL;
6580 }
6581
6582 if (slicelength <= 0) {
6583 return PyUnicode_FromUnicode(NULL, 0);
6584 } else {
6585 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006586 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6587 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006588
6589 if (result_buf == NULL)
6590 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006591
6592 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6593 result_buf[i] = source_buf[cur];
6594 }
Tim Petersced69f82003-09-16 20:30:58 +00006595
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006596 result = PyUnicode_FromUnicode(result_buf, slicelength);
6597 PyMem_FREE(result_buf);
6598 return result;
6599 }
6600 } else {
6601 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6602 return NULL;
6603 }
6604}
6605
6606static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006607 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006608 (binaryfunc)unicode_subscript, /* mp_subscript */
6609 (objobjargproc)0, /* mp_ass_subscript */
6610};
6611
Martin v. Löwis18e16552006-02-15 17:27:45 +00006612static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006614 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 const void **ptr)
6616{
6617 if (index != 0) {
6618 PyErr_SetString(PyExc_SystemError,
6619 "accessing non-existent unicode segment");
6620 return -1;
6621 }
6622 *ptr = (void *) self->str;
6623 return PyUnicode_GET_DATA_SIZE(self);
6624}
6625
Martin v. Löwis18e16552006-02-15 17:27:45 +00006626static Py_ssize_t
6627unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 const void **ptr)
6629{
6630 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006631 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 return -1;
6633}
6634
6635static int
6636unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006637 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638{
6639 if (lenp)
6640 *lenp = PyUnicode_GET_DATA_SIZE(self);
6641 return 1;
6642}
6643
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006644static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006646 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 const void **ptr)
6648{
6649 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 if (index != 0) {
6652 PyErr_SetString(PyExc_SystemError,
6653 "accessing non-existent unicode segment");
6654 return -1;
6655 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006656 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 if (str == NULL)
6658 return -1;
6659 *ptr = (void *) PyString_AS_STRING(str);
6660 return PyString_GET_SIZE(str);
6661}
6662
6663/* Helpers for PyUnicode_Format() */
6664
6665static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006666getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006668 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 if (argidx < arglen) {
6670 (*p_argidx)++;
6671 if (arglen < 0)
6672 return args;
6673 else
6674 return PyTuple_GetItem(args, argidx);
6675 }
6676 PyErr_SetString(PyExc_TypeError,
6677 "not enough arguments for format string");
6678 return NULL;
6679}
6680
6681#define F_LJUST (1<<0)
6682#define F_SIGN (1<<1)
6683#define F_BLANK (1<<2)
6684#define F_ALT (1<<3)
6685#define F_ZERO (1<<4)
6686
Martin v. Löwis18e16552006-02-15 17:27:45 +00006687static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006688strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006690 register Py_ssize_t i;
6691 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 for (i = len - 1; i >= 0; i--)
6693 buffer[i] = (Py_UNICODE) charbuffer[i];
6694
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 return len;
6696}
6697
Neal Norwitzfc76d632006-01-10 06:03:13 +00006698static int
6699doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6700{
Tim Peters15231542006-02-16 01:08:01 +00006701 Py_ssize_t result;
6702
Neal Norwitzfc76d632006-01-10 06:03:13 +00006703 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006704 result = strtounicode(buffer, (char *)buffer);
6705 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006706}
6707
6708static int
6709longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6710{
Tim Peters15231542006-02-16 01:08:01 +00006711 Py_ssize_t result;
6712
Neal Norwitzfc76d632006-01-10 06:03:13 +00006713 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006714 result = strtounicode(buffer, (char *)buffer);
6715 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006716}
6717
Guido van Rossum078151d2002-08-11 04:24:12 +00006718/* XXX To save some code duplication, formatfloat/long/int could have been
6719 shared with stringobject.c, converting from 8-bit to Unicode after the
6720 formatting is done. */
6721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722static int
6723formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006724 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 int flags,
6726 int prec,
6727 int type,
6728 PyObject *v)
6729{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006730 /* fmt = '%#.' + `prec` + `type`
6731 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 char fmt[20];
6733 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 x = PyFloat_AsDouble(v);
6736 if (x == -1.0 && PyErr_Occurred())
6737 return -1;
6738 if (prec < 0)
6739 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6741 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006742 /* Worst case length calc to ensure no buffer overrun:
6743
6744 'g' formats:
6745 fmt = %#.<prec>g
6746 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6747 for any double rep.)
6748 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6749
6750 'f' formats:
6751 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6752 len = 1 + 50 + 1 + prec = 52 + prec
6753
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006754 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006755 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006756
6757 */
6758 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6759 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006760 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006761 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006762 return -1;
6763 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006764 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6765 (flags&F_ALT) ? "#" : "",
6766 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006767 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768}
6769
Tim Peters38fd5b62000-09-21 05:43:11 +00006770static PyObject*
6771formatlong(PyObject *val, int flags, int prec, int type)
6772{
6773 char *buf;
6774 int i, len;
6775 PyObject *str; /* temporary string object. */
6776 PyUnicodeObject *result;
6777
6778 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6779 if (!str)
6780 return NULL;
6781 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006782 if (!result) {
6783 Py_DECREF(str);
6784 return NULL;
6785 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006786 for (i = 0; i < len; i++)
6787 result->str[i] = buf[i];
6788 result->str[len] = 0;
6789 Py_DECREF(str);
6790 return (PyObject*)result;
6791}
6792
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793static int
6794formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006795 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796 int flags,
6797 int prec,
6798 int type,
6799 PyObject *v)
6800{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006801 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006802 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6803 * + 1 + 1
6804 * = 24
6805 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006806 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006807 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 long x;
6809
6810 x = PyInt_AsLong(v);
6811 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006812 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006813 if (x < 0 && type == 'u') {
6814 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006815 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006816 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6817 sign = "-";
6818 else
6819 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006821 prec = 1;
6822
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006823 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6824 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006825 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006826 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006827 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006828 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006829 return -1;
6830 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006831
6832 if ((flags & F_ALT) &&
6833 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006834 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006835 * of issues that cause pain:
6836 * - when 0 is being converted, the C standard leaves off
6837 * the '0x' or '0X', which is inconsistent with other
6838 * %#x/%#X conversions and inconsistent with Python's
6839 * hex() function
6840 * - there are platforms that violate the standard and
6841 * convert 0 with the '0x' or '0X'
6842 * (Metrowerks, Compaq Tru64)
6843 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006844 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006845 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006846 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006847 * We can achieve the desired consistency by inserting our
6848 * own '0x' or '0X' prefix, and substituting %x/%X in place
6849 * of %#x/%#X.
6850 *
6851 * Note that this is the same approach as used in
6852 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006853 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006854 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6855 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006856 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006857 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006858 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6859 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006860 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006861 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006862 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006863 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006864 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006865 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866}
6867
6868static int
6869formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006870 size_t buflen,
6871 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006873 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006874 if (PyUnicode_Check(v)) {
6875 if (PyUnicode_GET_SIZE(v) != 1)
6876 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006880 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006881 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006882 goto onError;
6883 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
6886 else {
6887 /* Integer input truncated to a character */
6888 long x;
6889 x = PyInt_AsLong(v);
6890 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006891 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006892#ifdef Py_UNICODE_WIDE
6893 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006894 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006895 "%c arg not in range(0x110000) "
6896 "(wide Python build)");
6897 return -1;
6898 }
6899#else
6900 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006901 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006902 "%c arg not in range(0x10000) "
6903 "(narrow Python build)");
6904 return -1;
6905 }
6906#endif
6907 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 }
6909 buf[1] = '\0';
6910 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006911
6912 onError:
6913 PyErr_SetString(PyExc_TypeError,
6914 "%c requires int or char");
6915 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916}
6917
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE slots in the scratch buffer
   that receives formatted floats, ints and chars.  XXX The value is a
   magic number.  Every formatting routine bounds-checks against it, so
   nothing overflows, but allocating a buffer sized for each individual
   format might be the better long-term design.  The fixed buffer is
   sufficient for now.
*/
#define FORMATBUFLEN ((size_t)120)
6927
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928PyObject *PyUnicode_Format(PyObject *format,
6929 PyObject *args)
6930{
6931 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006932 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 int args_owned = 0;
6934 PyUnicodeObject *result = NULL;
6935 PyObject *dict = NULL;
6936 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 if (format == NULL || args == NULL) {
6939 PyErr_BadInternalCall();
6940 return NULL;
6941 }
6942 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006943 if (uformat == NULL)
6944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 fmt = PyUnicode_AS_UNICODE(uformat);
6946 fmtcnt = PyUnicode_GET_SIZE(uformat);
6947
6948 reslen = rescnt = fmtcnt + 100;
6949 result = _PyUnicode_New(reslen);
6950 if (result == NULL)
6951 goto onError;
6952 res = PyUnicode_AS_UNICODE(result);
6953
6954 if (PyTuple_Check(args)) {
6955 arglen = PyTuple_Size(args);
6956 argidx = 0;
6957 }
6958 else {
6959 arglen = -1;
6960 argidx = -2;
6961 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006962 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6963 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 dict = args;
6965
6966 while (--fmtcnt >= 0) {
6967 if (*fmt != '%') {
6968 if (--rescnt < 0) {
6969 rescnt = fmtcnt + 100;
6970 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006971 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006972 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6974 --rescnt;
6975 }
6976 *res++ = *fmt++;
6977 }
6978 else {
6979 /* Got a format specifier */
6980 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006981 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 Py_UNICODE c = '\0';
6984 Py_UNICODE fill;
6985 PyObject *v = NULL;
6986 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006987 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006989 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006990 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
6992 fmt++;
6993 if (*fmt == '(') {
6994 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006995 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 PyObject *key;
6997 int pcount = 1;
6998
6999 if (dict == NULL) {
7000 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007001 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 goto onError;
7003 }
7004 ++fmt;
7005 --fmtcnt;
7006 keystart = fmt;
7007 /* Skip over balanced parentheses */
7008 while (pcount > 0 && --fmtcnt >= 0) {
7009 if (*fmt == ')')
7010 --pcount;
7011 else if (*fmt == '(')
7012 ++pcount;
7013 fmt++;
7014 }
7015 keylen = fmt - keystart - 1;
7016 if (fmtcnt < 0 || pcount > 0) {
7017 PyErr_SetString(PyExc_ValueError,
7018 "incomplete format key");
7019 goto onError;
7020 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007021#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007022 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 then looked up since Python uses strings to hold
7024 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007025 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 key = PyUnicode_EncodeUTF8(keystart,
7027 keylen,
7028 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007029#else
7030 key = PyUnicode_FromUnicode(keystart, keylen);
7031#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 if (key == NULL)
7033 goto onError;
7034 if (args_owned) {
7035 Py_DECREF(args);
7036 args_owned = 0;
7037 }
7038 args = PyObject_GetItem(dict, key);
7039 Py_DECREF(key);
7040 if (args == NULL) {
7041 goto onError;
7042 }
7043 args_owned = 1;
7044 arglen = -1;
7045 argidx = -2;
7046 }
7047 while (--fmtcnt >= 0) {
7048 switch (c = *fmt++) {
7049 case '-': flags |= F_LJUST; continue;
7050 case '+': flags |= F_SIGN; continue;
7051 case ' ': flags |= F_BLANK; continue;
7052 case '#': flags |= F_ALT; continue;
7053 case '0': flags |= F_ZERO; continue;
7054 }
7055 break;
7056 }
7057 if (c == '*') {
7058 v = getnextarg(args, arglen, &argidx);
7059 if (v == NULL)
7060 goto onError;
7061 if (!PyInt_Check(v)) {
7062 PyErr_SetString(PyExc_TypeError,
7063 "* wants int");
7064 goto onError;
7065 }
7066 width = PyInt_AsLong(v);
7067 if (width < 0) {
7068 flags |= F_LJUST;
7069 width = -width;
7070 }
7071 if (--fmtcnt >= 0)
7072 c = *fmt++;
7073 }
7074 else if (c >= '0' && c <= '9') {
7075 width = c - '0';
7076 while (--fmtcnt >= 0) {
7077 c = *fmt++;
7078 if (c < '0' || c > '9')
7079 break;
7080 if ((width*10) / 10 != width) {
7081 PyErr_SetString(PyExc_ValueError,
7082 "width too big");
7083 goto onError;
7084 }
7085 width = width*10 + (c - '0');
7086 }
7087 }
7088 if (c == '.') {
7089 prec = 0;
7090 if (--fmtcnt >= 0)
7091 c = *fmt++;
7092 if (c == '*') {
7093 v = getnextarg(args, arglen, &argidx);
7094 if (v == NULL)
7095 goto onError;
7096 if (!PyInt_Check(v)) {
7097 PyErr_SetString(PyExc_TypeError,
7098 "* wants int");
7099 goto onError;
7100 }
7101 prec = PyInt_AsLong(v);
7102 if (prec < 0)
7103 prec = 0;
7104 if (--fmtcnt >= 0)
7105 c = *fmt++;
7106 }
7107 else if (c >= '0' && c <= '9') {
7108 prec = c - '0';
7109 while (--fmtcnt >= 0) {
7110 c = Py_CHARMASK(*fmt++);
7111 if (c < '0' || c > '9')
7112 break;
7113 if ((prec*10) / 10 != prec) {
7114 PyErr_SetString(PyExc_ValueError,
7115 "prec too big");
7116 goto onError;
7117 }
7118 prec = prec*10 + (c - '0');
7119 }
7120 }
7121 } /* prec */
7122 if (fmtcnt >= 0) {
7123 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 if (--fmtcnt >= 0)
7125 c = *fmt++;
7126 }
7127 }
7128 if (fmtcnt < 0) {
7129 PyErr_SetString(PyExc_ValueError,
7130 "incomplete format");
7131 goto onError;
7132 }
7133 if (c != '%') {
7134 v = getnextarg(args, arglen, &argidx);
7135 if (v == NULL)
7136 goto onError;
7137 }
7138 sign = 0;
7139 fill = ' ';
7140 switch (c) {
7141
7142 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007143 pbuf = formatbuf;
7144 /* presume that buffer length is at least 1 */
7145 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 len = 1;
7147 break;
7148
7149 case 's':
7150 case 'r':
7151 if (PyUnicode_Check(v) && c == 's') {
7152 temp = v;
7153 Py_INCREF(temp);
7154 }
7155 else {
7156 PyObject *unicode;
7157 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007158 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 else
7160 temp = PyObject_Repr(v);
7161 if (temp == NULL)
7162 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007163 if (PyUnicode_Check(temp))
7164 /* nothing to do */;
7165 else if (PyString_Check(temp)) {
7166 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007167 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007169 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007171 Py_DECREF(temp);
7172 temp = unicode;
7173 if (temp == NULL)
7174 goto onError;
7175 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007176 else {
7177 Py_DECREF(temp);
7178 PyErr_SetString(PyExc_TypeError,
7179 "%s argument has non-string str()");
7180 goto onError;
7181 }
7182 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007183 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 len = PyUnicode_GET_SIZE(temp);
7185 if (prec >= 0 && len > prec)
7186 len = prec;
7187 break;
7188
7189 case 'i':
7190 case 'd':
7191 case 'u':
7192 case 'o':
7193 case 'x':
7194 case 'X':
7195 if (c == 'i')
7196 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007197 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007198 temp = formatlong(v, flags, prec, c);
7199 if (!temp)
7200 goto onError;
7201 pbuf = PyUnicode_AS_UNICODE(temp);
7202 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007203 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007205 else {
7206 pbuf = formatbuf;
7207 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7208 flags, prec, c, v);
7209 if (len < 0)
7210 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007211 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007212 }
7213 if (flags & F_ZERO)
7214 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 break;
7216
7217 case 'e':
7218 case 'E':
7219 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007220 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 case 'g':
7222 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007223 if (c == 'F')
7224 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007225 pbuf = formatbuf;
7226 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7227 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228 if (len < 0)
7229 goto onError;
7230 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007231 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 fill = '0';
7233 break;
7234
7235 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007236 pbuf = formatbuf;
7237 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 if (len < 0)
7239 goto onError;
7240 break;
7241
7242 default:
7243 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007244 "unsupported format character '%c' (0x%x) "
7245 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007246 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007247 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007248 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 goto onError;
7250 }
7251 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007252 if (*pbuf == '-' || *pbuf == '+') {
7253 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 len--;
7255 }
7256 else if (flags & F_SIGN)
7257 sign = '+';
7258 else if (flags & F_BLANK)
7259 sign = ' ';
7260 else
7261 sign = 0;
7262 }
7263 if (width < len)
7264 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007265 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 reslen -= rescnt;
7267 rescnt = width + fmtcnt + 100;
7268 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007269 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007270 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007271 PyErr_NoMemory();
7272 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007273 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007274 if (_PyUnicode_Resize(&result, reslen) < 0) {
7275 Py_XDECREF(temp);
7276 goto onError;
7277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 res = PyUnicode_AS_UNICODE(result)
7279 + reslen - rescnt;
7280 }
7281 if (sign) {
7282 if (fill != ' ')
7283 *res++ = sign;
7284 rescnt--;
7285 if (width > len)
7286 width--;
7287 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007288 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7289 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007290 assert(pbuf[1] == c);
7291 if (fill != ' ') {
7292 *res++ = *pbuf++;
7293 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007294 }
Tim Petersfff53252001-04-12 18:38:48 +00007295 rescnt -= 2;
7296 width -= 2;
7297 if (width < 0)
7298 width = 0;
7299 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 if (width > len && !(flags & F_LJUST)) {
7302 do {
7303 --rescnt;
7304 *res++ = fill;
7305 } while (--width > len);
7306 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007307 if (fill == ' ') {
7308 if (sign)
7309 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007310 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007311 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007312 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007313 *res++ = *pbuf++;
7314 *res++ = *pbuf++;
7315 }
7316 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007317 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318 res += len;
7319 rescnt -= len;
7320 while (--width >= len) {
7321 --rescnt;
7322 *res++ = ' ';
7323 }
7324 if (dict && (argidx < arglen) && c != '%') {
7325 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007326 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007327 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 goto onError;
7329 }
7330 Py_XDECREF(temp);
7331 } /* '%' */
7332 } /* until end */
7333 if (argidx < arglen && !dict) {
7334 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007335 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336 goto onError;
7337 }
7338
Thomas Woutersa96affe2006-03-12 00:29:36 +00007339 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7340 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 if (args_owned) {
7342 Py_DECREF(args);
7343 }
7344 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 return (PyObject *)result;
7346
7347 onError:
7348 Py_XDECREF(result);
7349 Py_DECREF(uformat);
7350 if (args_owned) {
7351 Py_DECREF(args);
7352 }
7353 return NULL;
7354}
7355
7356static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007357 (readbufferproc) unicode_buffer_getreadbuf,
7358 (writebufferproc) unicode_buffer_getwritebuf,
7359 (segcountproc) unicode_buffer_getsegcount,
7360 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361};
7362
Jeremy Hylton938ace62002-07-17 16:30:39 +00007363static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007364unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7365
Tim Peters6d6c1a32001-08-02 04:15:00 +00007366static PyObject *
7367unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7368{
7369 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007370 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007371 char *encoding = NULL;
7372 char *errors = NULL;
7373
Guido van Rossume023fe02001-08-30 03:12:59 +00007374 if (type != &PyUnicode_Type)
7375 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007376 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7377 kwlist, &x, &encoding, &errors))
7378 return NULL;
7379 if (x == NULL)
7380 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007381 if (encoding == NULL && errors == NULL)
7382 return PyObject_Unicode(x);
7383 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007384 return PyUnicode_FromEncodedObject(x, encoding, errors);
7385}
7386
Guido van Rossume023fe02001-08-30 03:12:59 +00007387static PyObject *
7388unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7389{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007390 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007391 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007392
7393 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7394 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7395 if (tmp == NULL)
7396 return NULL;
7397 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007398 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007399 if (pnew == NULL) {
7400 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007401 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007402 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007403 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7404 if (pnew->str == NULL) {
7405 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007406 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007407 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007408 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007409 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007410 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7411 pnew->length = n;
7412 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007413 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007414 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007415}
7416
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007417PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007418"unicode(string [, encoding[, errors]]) -> object\n\
7419\n\
7420Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007421encoding defaults to the current default string encoding.\n\
7422errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007423
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424PyTypeObject PyUnicode_Type = {
7425 PyObject_HEAD_INIT(&PyType_Type)
7426 0, /* ob_size */
7427 "unicode", /* tp_name */
7428 sizeof(PyUnicodeObject), /* tp_size */
7429 0, /* tp_itemsize */
7430 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007431 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007433 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 0, /* tp_setattr */
7435 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007436 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007437 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007439 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 (hashfunc) unicode_hash, /* tp_hash*/
7441 0, /* tp_call*/
7442 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007443 PyObject_GenericGetAttr, /* tp_getattro */
7444 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007446 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7447 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007448 unicode_doc, /* tp_doc */
7449 0, /* tp_traverse */
7450 0, /* tp_clear */
7451 0, /* tp_richcompare */
7452 0, /* tp_weaklistoffset */
7453 0, /* tp_iter */
7454 0, /* tp_iternext */
7455 unicode_methods, /* tp_methods */
7456 0, /* tp_members */
7457 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007458 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007459 0, /* tp_dict */
7460 0, /* tp_descr_get */
7461 0, /* tp_descr_set */
7462 0, /* tp_dictoffset */
7463 0, /* tp_init */
7464 0, /* tp_alloc */
7465 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007466 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467};
7468
7469/* Initialize the Unicode implementation */
7470
Thomas Wouters78890102000-07-22 19:25:51 +00007471void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007473 int i;
7474
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007475 /* XXX - move this array to unicodectype.c ? */
7476 Py_UNICODE linebreak[] = {
7477 0x000A, /* LINE FEED */
7478 0x000D, /* CARRIAGE RETURN */
7479 0x001C, /* FILE SEPARATOR */
7480 0x001D, /* GROUP SEPARATOR */
7481 0x001E, /* RECORD SEPARATOR */
7482 0x0085, /* NEXT LINE */
7483 0x2028, /* LINE SEPARATOR */
7484 0x2029, /* PARAGRAPH SEPARATOR */
7485 };
7486
Fred Drakee4315f52000-05-09 19:53:39 +00007487 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007488 unicode_freelist = NULL;
7489 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007491 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007492 for (i = 0; i < 256; i++)
7493 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007494 if (PyType_Ready(&PyUnicode_Type) < 0)
7495 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007496
7497 /* initialize the linebreak bloom filter */
7498 bloom_linebreak = make_bloom_mask(
7499 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7500 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501}
7502
7503/* Finalize the Unicode implementation */
7504
7505void
Thomas Wouters78890102000-07-22 19:25:51 +00007506_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007508 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007509 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007511 Py_XDECREF(unicode_empty);
7512 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007513
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007514 for (i = 0; i < 256; i++) {
7515 if (unicode_latin1[i]) {
7516 Py_DECREF(unicode_latin1[i]);
7517 unicode_latin1[i] = NULL;
7518 }
7519 }
7520
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007521 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 PyUnicodeObject *v = u;
7523 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007524 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007525 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007526 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007527 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007529 unicode_freelist = NULL;
7530 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007532
Anthony Baxterac6bd462006-04-13 02:06:09 +00007533#ifdef __cplusplus
7534}
7535#endif
7536
7537
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007538/*
7539Local variables:
7540c-basic-offset: 4
7541indent-tabs-mode: nil
7542End:
7543*/