blob: c5e87a8519e18bf3477c90e8377fc896fe32b6ff [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
Martin v. Löwis5cb69362006-04-14 09:08:42 +000039#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000040#include "Python.h"
41
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Fredrik Lundhb63588c2006-05-23 18:44:25 +000049#undef USE_INLINE /* XXX - set via configure? */
50
51#if defined(_MSC_VER) /* this is taken from _sre.c */
52#pragma warning(disable: 4710)
53/* fastest possible local call under MSVC */
54#define LOCAL(type) static __inline type __fastcall
55#elif defined(USE_INLINE)
56#define LOCAL(type) static inline type
57#else
58#define LOCAL(type) static type
59#endif
60
Guido van Rossumd57fd912000-03-10 22:53:23 +000061/* Limit for the Unicode object free list */
62
63#define MAX_UNICODE_FREELIST_SIZE 1024
64
65/* Limit for the Unicode object free list stay alive optimization.
66
67 The implementation will keep allocated Unicode memory intact for
68 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000069 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
Barry Warsaw51ac5802000-03-20 16:36:48 +000071 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000072 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000073 malloc()-overhead) bytes of unused garbage.
74
75 Setting the limit to 0 effectively turns the feature off.
76
Guido van Rossumfd4b9572000-04-10 13:51:10 +000077 Note: This is an experimental feature ! If you get core dumps when
78 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000079
80*/
81
Guido van Rossumfd4b9572000-04-10 13:51:10 +000082#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000083
84/* Endianness switches; defaults to little endian */
85
86#ifdef WORDS_BIGENDIAN
87# define BYTEORDER_IS_BIG_ENDIAN
88#else
89# define BYTEORDER_IS_LITTLE_ENDIAN
90#endif
91
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000092/* --- Globals ------------------------------------------------------------
93
94 The globals are initialized by the _PyUnicode_Init() API and should
95 not be used before calling that API.
96
97*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Anthony Baxterac6bd462006-04-13 02:06:09 +000099
100#ifdef __cplusplus
101extern "C" {
102#endif
103
Guido van Rossumd57fd912000-03-10 22:53:23 +0000104/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105static PyUnicodeObject *unicode_freelist;
106static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000108/* The empty Unicode object is shared to improve performance. */
109static PyUnicodeObject *unicode_empty;
110
111/* Single character Unicode strings in the Latin-1 range are being
112 shared as well. */
113static PyUnicodeObject *unicode_latin1[256];
114
Fred Drakee4315f52000-05-09 19:53:39 +0000115/* Default encoding to use and assume when NULL is passed as encoding
116 parameter; it is initialized by _PyUnicode_Init().
117
118 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000119 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000120
121*/
Fred Drakee4315f52000-05-09 19:53:39 +0000122static char unicode_default_encoding[100];
123
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000124Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000125PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000126{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000127#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000128 return 0x10FFFF;
129#else
130 /* This is actually an illegal character, so it should
131 not be passed to unichr. */
132 return 0xFFFF;
133#endif
134}
135
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000136/* --- Bloom Filters ----------------------------------------------------- */
137
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
143
144#define BLOOM_MASK unsigned long
145
146static BLOOM_MASK bloom_linebreak;
147
148#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
149
150#define BLOOM_LINEBREAK(ch)\
151 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
152
153LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
154{
155 /* calculate simple bloom-style bitmask for a given unicode string */
156
157 long mask;
158 Py_ssize_t i;
159
160 mask = 0;
161 for (i = 0; i < len; i++)
162 mask |= (1 << (ptr[i] & 0x1F));
163
164 return mask;
165}
166
167LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
168{
169 Py_ssize_t i;
170
171 for (i = 0; i < setlen; i++)
172 if (set[i] == chr)
173 return 1;
174
Fredrik Lundh77633512006-05-23 19:47:35 +0000175 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000176}
177
/* Bloom pre-check followed by the exact scan of set.  The expansion is
   wrapped in parentheses so that it stays a single operand when used
   inside a larger expression (next to ||, ! or ?:).  Note that chr is
   evaluated twice, so it must not have side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
180
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181/* --- Unicode Object ----------------------------------------------------- */
182
183static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000185 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186{
187 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000188
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000189 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000191 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 /* Resizing shared object (unicode_empty or single character
194 objects) in-place is not allowed. Use PyUnicode_Resize()
195 instead ! */
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000196 if (unicode == unicode_empty ||
197 (unicode->length == 1 &&
198 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000200 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 return -1;
203 }
204
205 /* We allocate one more byte to make sure the string is
206 Ux0000 terminated -- XXX is this needed ? */
207 oldstr = unicode->str;
208 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
209 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000210 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 PyErr_NoMemory();
212 return -1;
213 }
214 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000215 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000217 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000219 if (unicode->defenc) {
220 Py_DECREF(unicode->defenc);
221 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 }
223 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000224
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 return 0;
226}
227
228/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000229 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230
231 XXX This allocator could further be enhanced by assuring that the
232 free list never reduces its size below 1.
233
234*/
235
236static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000237PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238{
239 register PyUnicodeObject *unicode;
240
Tim Petersced69f82003-09-16 20:30:58 +0000241 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (length == 0 && unicode_empty != NULL) {
243 Py_INCREF(unicode_empty);
244 return unicode_empty;
245 }
246
247 /* Unicode freelist & memory allocation */
248 if (unicode_freelist) {
249 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000250 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization: we only upsize the buffer,
254 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000255 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000256 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000257 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000258 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 }
260 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000261 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000263 }
264 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 }
266 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000267 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 if (unicode == NULL)
269 return NULL;
270 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
271 }
272
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000273 if (!unicode->str) {
274 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000275 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000276 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000277 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000278 * the caller fails before initializing str -- unicode_resize()
279 * reads str[0], and the Keep-Alive optimization can keep memory
280 * allocated for str alive across a call to unicode_dealloc(unicode).
281 * We don't want unicode_resize to read uninitialized memory in
282 * that case.
283 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000284 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000286 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000288 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000290
291 onError:
292 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000293 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295}
296
297static
Guido van Rossum9475a232001-10-05 20:51:39 +0000298void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000300 if (PyUnicode_CheckExact(unicode) &&
301 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000302 /* Keep-Alive optimization */
303 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000304 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 unicode->str = NULL;
306 unicode->length = 0;
307 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000308 if (unicode->defenc) {
309 Py_DECREF(unicode->defenc);
310 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000311 }
312 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313 *(PyUnicodeObject **)unicode = unicode_freelist;
314 unicode_freelist = unicode;
315 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000318 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000319 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000320 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321 }
322}
323
Martin v. Löwis18e16552006-02-15 17:27:45 +0000324int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000325{
326 register PyUnicodeObject *v;
327
328 /* Argument checks */
329 if (unicode == NULL) {
330 PyErr_BadInternalCall();
331 return -1;
332 }
333 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000334 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000335 PyErr_BadInternalCall();
336 return -1;
337 }
338
339 /* Resizing unicode_empty and single character objects is not
340 possible since these are being shared. We simply return a fresh
341 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000342 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000343 (v == unicode_empty || v->length == 1)) {
344 PyUnicodeObject *w = _PyUnicode_New(length);
345 if (w == NULL)
346 return -1;
347 Py_UNICODE_COPY(w->str, v->str,
348 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000349 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000350 *unicode = (PyObject *)w;
351 return 0;
352 }
353
354 /* Note that we don't have to modify *unicode for unshared Unicode
355 objects, since we can modify them in-place. */
356 return unicode_resize(v, length);
357}
358
359/* Internal API for use in unicodeobject.c only ! */
360#define _PyUnicode_Resize(unicodevar, length) \
361 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000364 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365{
366 PyUnicodeObject *unicode;
367
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000368 /* If the Unicode data is known at construction time, we can apply
369 some optimizations which share commonly used objects. */
370 if (u != NULL) {
371
372 /* Optimization for empty strings */
373 if (size == 0 && unicode_empty != NULL) {
374 Py_INCREF(unicode_empty);
375 return (PyObject *)unicode_empty;
376 }
377
378 /* Single character Unicode objects in the Latin-1 range are
379 shared when using this constructor */
380 if (size == 1 && *u < 256) {
381 unicode = unicode_latin1[*u];
382 if (!unicode) {
383 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000384 if (!unicode)
385 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000386 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000387 unicode_latin1[*u] = unicode;
388 }
389 Py_INCREF(unicode);
390 return (PyObject *)unicode;
391 }
392 }
Tim Petersced69f82003-09-16 20:30:58 +0000393
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 unicode = _PyUnicode_New(size);
395 if (!unicode)
396 return NULL;
397
398 /* Copy the Unicode data into the new object */
399 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000400 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401
402 return (PyObject *)unicode;
403}
404
405#ifdef HAVE_WCHAR_H
406
407PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000408 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000409{
410 PyUnicodeObject *unicode;
411
412 if (w == NULL) {
413 PyErr_BadInternalCall();
414 return NULL;
415 }
416
417 unicode = _PyUnicode_New(size);
418 if (!unicode)
419 return NULL;
420
421 /* Copy the wchar_t data into the new object */
422#ifdef HAVE_USABLE_WCHAR_T
423 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000424#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 {
426 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000427 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000429 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430 *u++ = *w++;
431 }
432#endif
433
434 return (PyObject *)unicode;
435}
436
Martin v. Löwis18e16552006-02-15 17:27:45 +0000437Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
438 wchar_t *w,
439 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440{
441 if (unicode == NULL) {
442 PyErr_BadInternalCall();
443 return -1;
444 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000445
446 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000448 size = PyUnicode_GET_SIZE(unicode) + 1;
449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450#ifdef HAVE_USABLE_WCHAR_T
451 memcpy(w, unicode->str, size * sizeof(wchar_t));
452#else
453 {
454 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000455 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000457 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 *w++ = *u++;
459 }
460#endif
461
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000462 if (size > PyUnicode_GET_SIZE(unicode))
463 return PyUnicode_GET_SIZE(unicode);
464 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465 return size;
466}
467
468#endif
469
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000470PyObject *PyUnicode_FromOrdinal(int ordinal)
471{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000472 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000473
474#ifdef Py_UNICODE_WIDE
475 if (ordinal < 0 || ordinal > 0x10ffff) {
476 PyErr_SetString(PyExc_ValueError,
477 "unichr() arg not in range(0x110000) "
478 "(wide Python build)");
479 return NULL;
480 }
481#else
482 if (ordinal < 0 || ordinal > 0xffff) {
483 PyErr_SetString(PyExc_ValueError,
484 "unichr() arg not in range(0x10000) "
485 "(narrow Python build)");
486 return NULL;
487 }
488#endif
489
Hye-Shik Chang40574832004-04-06 07:24:51 +0000490 s[0] = (Py_UNICODE)ordinal;
491 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000492}
493
Guido van Rossumd57fd912000-03-10 22:53:23 +0000494PyObject *PyUnicode_FromObject(register PyObject *obj)
495{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000496 /* XXX Perhaps we should make this API an alias of
497 PyObject_Unicode() instead ?! */
498 if (PyUnicode_CheckExact(obj)) {
499 Py_INCREF(obj);
500 return obj;
501 }
502 if (PyUnicode_Check(obj)) {
503 /* For a Unicode subtype that's not a Unicode object,
504 return a true Unicode object with the same data. */
505 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
506 PyUnicode_GET_SIZE(obj));
507 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
509}
510
511PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
512 const char *encoding,
513 const char *errors)
514{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000515 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000516 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000518
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519 if (obj == NULL) {
520 PyErr_BadInternalCall();
521 return NULL;
522 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000524#if 0
525 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000526 that no encodings is given and then redirect to
527 PyObject_Unicode() which then applies the additional logic for
528 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000530 NOTE: This API should really only be used for object which
531 represent *encoded* Unicode !
532
533 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 if (PyUnicode_Check(obj)) {
535 if (encoding) {
536 PyErr_SetString(PyExc_TypeError,
537 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000538 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000539 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000540 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000541 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000542#else
543 if (PyUnicode_Check(obj)) {
544 PyErr_SetString(PyExc_TypeError,
545 "decoding Unicode is not supported");
546 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000547 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000548#endif
549
550 /* Coerce object */
551 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000552 s = PyString_AS_STRING(obj);
553 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
556 /* Overwrite the error message with something more useful in
557 case of a TypeError. */
558 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000559 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000560 "coercing to Unicode: need string or buffer, "
561 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000562 obj->ob_type->tp_name);
563 goto onError;
564 }
Tim Petersced69f82003-09-16 20:30:58 +0000565
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000566 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 if (len == 0) {
568 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 }
Tim Petersced69f82003-09-16 20:30:58 +0000571 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000573
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000574 return v;
575
576 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578}
579
580PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000581 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 const char *encoding,
583 const char *errors)
584{
585 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000586
587 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000588 encoding = PyUnicode_GetDefaultEncoding();
589
590 /* Shortcuts for common default encodings */
591 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000592 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000593 else if (strcmp(encoding, "latin-1") == 0)
594 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000595#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
596 else if (strcmp(encoding, "mbcs") == 0)
597 return PyUnicode_DecodeMBCS(s, size, errors);
598#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000599 else if (strcmp(encoding, "ascii") == 0)
600 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000601
602 /* Decode via the codec registry */
603 buffer = PyBuffer_FromMemory((void *)s, size);
604 if (buffer == NULL)
605 goto onError;
606 unicode = PyCodec_Decode(buffer, encoding, errors);
607 if (unicode == NULL)
608 goto onError;
609 if (!PyUnicode_Check(unicode)) {
610 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000611 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612 unicode->ob_type->tp_name);
613 Py_DECREF(unicode);
614 goto onError;
615 }
616 Py_DECREF(buffer);
617 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000618
Guido van Rossumd57fd912000-03-10 22:53:23 +0000619 onError:
620 Py_XDECREF(buffer);
621 return NULL;
622}
623
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000624PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
625 const char *encoding,
626 const char *errors)
627{
628 PyObject *v;
629
630 if (!PyUnicode_Check(unicode)) {
631 PyErr_BadArgument();
632 goto onError;
633 }
634
635 if (encoding == NULL)
636 encoding = PyUnicode_GetDefaultEncoding();
637
638 /* Decode via the codec registry */
639 v = PyCodec_Decode(unicode, encoding, errors);
640 if (v == NULL)
641 goto onError;
642 return v;
643
644 onError:
645 return NULL;
646}
647
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000649 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 const char *encoding,
651 const char *errors)
652{
653 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000654
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655 unicode = PyUnicode_FromUnicode(s, size);
656 if (unicode == NULL)
657 return NULL;
658 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
659 Py_DECREF(unicode);
660 return v;
661}
662
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000663PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
664 const char *encoding,
665 const char *errors)
666{
667 PyObject *v;
668
669 if (!PyUnicode_Check(unicode)) {
670 PyErr_BadArgument();
671 goto onError;
672 }
673
674 if (encoding == NULL)
675 encoding = PyUnicode_GetDefaultEncoding();
676
677 /* Encode via the codec registry */
678 v = PyCodec_Encode(unicode, encoding, errors);
679 if (v == NULL)
680 goto onError;
681 return v;
682
683 onError:
684 return NULL;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
688 const char *encoding,
689 const char *errors)
690{
691 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000692
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 if (!PyUnicode_Check(unicode)) {
694 PyErr_BadArgument();
695 goto onError;
696 }
Fred Drakee4315f52000-05-09 19:53:39 +0000697
Tim Petersced69f82003-09-16 20:30:58 +0000698 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000699 encoding = PyUnicode_GetDefaultEncoding();
700
701 /* Shortcuts for common default encodings */
702 if (errors == NULL) {
703 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000704 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000705 else if (strcmp(encoding, "latin-1") == 0)
706 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000707#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
708 else if (strcmp(encoding, "mbcs") == 0)
709 return PyUnicode_AsMBCSString(unicode);
710#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000711 else if (strcmp(encoding, "ascii") == 0)
712 return PyUnicode_AsASCIIString(unicode);
713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714
715 /* Encode via the codec registry */
716 v = PyCodec_Encode(unicode, encoding, errors);
717 if (v == NULL)
718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 if (!PyString_Check(v)) {
720 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000721 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 v->ob_type->tp_name);
723 Py_DECREF(v);
724 goto onError;
725 }
726 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000727
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 onError:
729 return NULL;
730}
731
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000732PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
733 const char *errors)
734{
735 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
736
737 if (v)
738 return v;
739 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
740 if (v && errors == NULL)
741 ((PyUnicodeObject *)unicode)->defenc = v;
742 return v;
743}
744
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
746{
747 if (!PyUnicode_Check(unicode)) {
748 PyErr_BadArgument();
749 goto onError;
750 }
751 return PyUnicode_AS_UNICODE(unicode);
752
753 onError:
754 return NULL;
755}
756
Martin v. Löwis18e16552006-02-15 17:27:45 +0000757Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758{
759 if (!PyUnicode_Check(unicode)) {
760 PyErr_BadArgument();
761 goto onError;
762 }
763 return PyUnicode_GET_SIZE(unicode);
764
765 onError:
766 return -1;
767}
768
Thomas Wouters78890102000-07-22 19:25:51 +0000769const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000770{
771 return unicode_default_encoding;
772}
773
774int PyUnicode_SetDefaultEncoding(const char *encoding)
775{
776 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000777
Fred Drakee4315f52000-05-09 19:53:39 +0000778 /* Make sure the encoding is valid. As side effect, this also
779 loads the encoding into the codec registry cache. */
780 v = _PyCodec_Lookup(encoding);
781 if (v == NULL)
782 goto onError;
783 Py_DECREF(v);
784 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000785 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000786 sizeof(unicode_default_encoding));
787 return 0;
788
789 onError:
790 return -1;
791}
792
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000793/* error handling callback helper:
794 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000795 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000796 and adjust various state variables.
797 return 0 on success, -1 on error
798*/
799
800static
801int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
802 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000803 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
804 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000805{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000806 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807
808 PyObject *restuple = NULL;
809 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
811 Py_ssize_t requiredsize;
812 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000814 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000815 int res = -1;
816
817 if (*errorHandler == NULL) {
818 *errorHandler = PyCodec_LookupError(errors);
819 if (*errorHandler == NULL)
820 goto onError;
821 }
822
823 if (*exceptionObject == NULL) {
824 *exceptionObject = PyUnicodeDecodeError_Create(
825 encoding, input, insize, *startinpos, *endinpos, reason);
826 if (*exceptionObject == NULL)
827 goto onError;
828 }
829 else {
830 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
831 goto onError;
832 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
833 goto onError;
834 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
835 goto onError;
836 }
837
838 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
839 if (restuple == NULL)
840 goto onError;
841 if (!PyTuple_Check(restuple)) {
842 PyErr_Format(PyExc_TypeError, &argparse[4]);
843 goto onError;
844 }
845 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
846 goto onError;
847 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000848 newpos = insize+newpos;
849 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000850 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000851 goto onError;
852 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000853
854 /* need more space? (at least enough for what we
855 have+the replacement+the rest of the string (starting
856 at the new input position), so we won't have to check space
857 when there are no errors in the rest of the string) */
858 repptr = PyUnicode_AS_UNICODE(repunicode);
859 repsize = PyUnicode_GET_SIZE(repunicode);
860 requiredsize = *outpos + repsize + insize-newpos;
861 if (requiredsize > outsize) {
862 if (requiredsize<2*outsize)
863 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000864 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000865 goto onError;
866 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
867 }
868 *endinpos = newpos;
869 *inptr = input + newpos;
870 Py_UNICODE_COPY(*outptr, repptr, repsize);
871 *outptr += repsize;
872 *outpos += repsize;
873 /* we made it! */
874 res = 0;
875
876 onError:
877 Py_XDECREF(restuple);
878 return res;
879}
880
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if c must be base64-encoded: always for c outside 1..127 and for
   table class 1, for whitespace (class 2) only if encodeWS is set and for
   Set O (class 3) only if encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  The range test comes first so that
   isalnum() is never called with a value it is not defined for. */
#define B64CHAR(c) \
    ((c) > 0 && (c) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))

/* Value (0..63) of the base64 digit c; only meaningful when B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to *out++ while at least 6 bits are pending in ch;
   bits is reduced by 6 for every digit written. */
#define ENCODE(out, ch, bits)                   \
    do {                                        \
        while ((bits) >= 6) {                   \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6;                        \
        }                                       \
    } while (0)

/* Emit 16-bit units to *out++ while at least 16 bits are pending in ch.
   Relies on a local `errmsg` variable and a `utf7Error` label at the place
   of use. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */                          \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000947PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000948 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 const char *errors)
950{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000952 Py_ssize_t startinpos;
953 Py_ssize_t endinpos;
954 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 const char *e;
956 PyUnicodeObject *unicode;
957 Py_UNICODE *p;
958 const char *errmsg = "";
959 int inShift = 0;
960 unsigned int bitsleft = 0;
961 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000962 int surrogate = 0;
963 PyObject *errorHandler = NULL;
964 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965
966 unicode = _PyUnicode_New(size);
967 if (!unicode)
968 return NULL;
969 if (size == 0)
970 return (PyObject *)unicode;
971
972 p = unicode->str;
973 e = s + size;
974
975 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000976 Py_UNICODE ch;
977 restart:
978 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000979
980 if (inShift) {
981 if ((ch == '-') || !B64CHAR(ch)) {
982 inShift = 0;
983 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000984
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000985 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
986 if (bitsleft >= 6) {
987 /* The shift sequence has a partial character in it. If
988 bitsleft < 6 then we could just classify it as padding
989 but that is not the case here */
990
991 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000992 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000993 }
994 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000995 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000996 here so indicate the potential of a misencoded character. */
997
998 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
999 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1000 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001001 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001002 }
1003
1004 if (ch == '-') {
1005 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001006 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 inShift = 1;
1008 }
1009 } else if (SPECIAL(ch,0,0)) {
1010 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001011 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 } else {
1013 *p++ = ch;
1014 }
1015 } else {
1016 charsleft = (charsleft << 6) | UB64(ch);
1017 bitsleft += 6;
1018 s++;
1019 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1020 }
1021 }
1022 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001023 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001024 s++;
1025 if (s < e && *s == '-') {
1026 s++;
1027 *p++ = '+';
1028 } else
1029 {
1030 inShift = 1;
1031 bitsleft = 0;
1032 }
1033 }
1034 else if (SPECIAL(ch,0,0)) {
1035 errmsg = "unexpected special character";
1036 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001037 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 }
1039 else {
1040 *p++ = ch;
1041 s++;
1042 }
1043 continue;
1044 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001045 outpos = p-PyUnicode_AS_UNICODE(unicode);
1046 endinpos = s-starts;
1047 if (unicode_decode_call_errorhandler(
1048 errors, &errorHandler,
1049 "utf7", errmsg,
1050 starts, size, &startinpos, &endinpos, &exc, &s,
1051 (PyObject **)&unicode, &outpos, &p))
1052 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 }
1054
1055 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001056 outpos = p-PyUnicode_AS_UNICODE(unicode);
1057 endinpos = size;
1058 if (unicode_decode_call_errorhandler(
1059 errors, &errorHandler,
1060 "utf7", "unterminated shift sequence",
1061 starts, size, &startinpos, &endinpos, &exc, &s,
1062 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001063 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001064 if (s < e)
1065 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 }
1067
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001068 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 goto onError;
1070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001073 return (PyObject *)unicode;
1074
1075onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001076 Py_XDECREF(errorHandler);
1077 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001078 Py_DECREF(unicode);
1079 return NULL;
1080}
1081
1082
1083PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001084 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 int encodeSetO,
1086 int encodeWhiteSpace,
1087 const char *errors)
1088{
1089 PyObject *v;
1090 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001091 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001092 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001093 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001094 unsigned int bitsleft = 0;
1095 unsigned long charsleft = 0;
1096 char * out;
1097 char * start;
1098
1099 if (size == 0)
1100 return PyString_FromStringAndSize(NULL, 0);
1101
1102 v = PyString_FromStringAndSize(NULL, cbAllocated);
1103 if (v == NULL)
1104 return NULL;
1105
1106 start = out = PyString_AS_STRING(v);
1107 for (;i < size; ++i) {
1108 Py_UNICODE ch = s[i];
1109
1110 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001111 if (ch == '+') {
1112 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001113 *out++ = '-';
1114 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1115 charsleft = ch;
1116 bitsleft = 16;
1117 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001118 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001119 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001120 } else {
1121 *out++ = (char) ch;
1122 }
1123 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001124 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1125 *out++ = B64(charsleft << (6-bitsleft));
1126 charsleft = 0;
1127 bitsleft = 0;
1128 /* Characters not in the BASE64 set implicitly unshift the sequence
1129 so no '-' is required, except if the character is itself a '-' */
1130 if (B64CHAR(ch) || ch == '-') {
1131 *out++ = '-';
1132 }
1133 inShift = 0;
1134 *out++ = (char) ch;
1135 } else {
1136 bitsleft += 16;
1137 charsleft = (charsleft << 16) | ch;
1138 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1139
1140 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001141 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001142 or '-' then the shift sequence will be terminated implicitly and we
1143 don't have to insert a '-'. */
1144
1145 if (bitsleft == 0) {
1146 if (i + 1 < size) {
1147 Py_UNICODE ch2 = s[i+1];
1148
1149 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001150
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001151 } else if (B64CHAR(ch2) || ch2 == '-') {
1152 *out++ = '-';
1153 inShift = 0;
1154 } else {
1155 inShift = 0;
1156 }
1157
1158 }
1159 else {
1160 *out++ = '-';
1161 inShift = 0;
1162 }
1163 }
Tim Petersced69f82003-09-16 20:30:58 +00001164 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001165 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001166 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001167 if (bitsleft) {
1168 *out++= B64(charsleft << (6-bitsleft) );
1169 *out++ = '-';
1170 }
1171
Tim Peters5de98422002-04-27 18:44:32 +00001172 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001173 return v;
1174}
1175
1176#undef SPECIAL
1177#undef B64
1178#undef B64CHAR
1179#undef UB64
1180#undef ENCODE
1181#undef DECODE
1182
/* --- UTF-8 Codec -------------------------------------------------------- */

static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.  The table is only ever
       read, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1206
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001208 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 const char *errors)
1210{
Walter Dörwald69652032004-09-07 20:24:22 +00001211 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1212}
1213
1214PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001215 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001216 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001217 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001221 Py_ssize_t startinpos;
1222 Py_ssize_t endinpos;
1223 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 const char *e;
1225 PyUnicodeObject *unicode;
1226 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001228 PyObject *errorHandler = NULL;
1229 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230
1231 /* Note: size will always be longer than the resulting Unicode
1232 character count */
1233 unicode = _PyUnicode_New(size);
1234 if (!unicode)
1235 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001236 if (size == 0) {
1237 if (consumed)
1238 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241
1242 /* Unpack UTF-8 encoded data */
1243 p = unicode->str;
1244 e = s + size;
1245
1246 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001247 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248
1249 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001250 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 s++;
1252 continue;
1253 }
1254
1255 n = utf8_code_length[ch];
1256
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001257 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001258 if (consumed)
1259 break;
1260 else {
1261 errmsg = "unexpected end of data";
1262 startinpos = s-starts;
1263 endinpos = size;
1264 goto utf8Error;
1265 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267
1268 switch (n) {
1269
1270 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001272 startinpos = s-starts;
1273 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001274 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275
1276 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 startinpos = s-starts;
1279 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281
1282 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001283 if ((s[1] & 0xc0) != 0x80) {
1284 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001285 startinpos = s-starts;
1286 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001287 goto utf8Error;
1288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001290 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001291 startinpos = s-starts;
1292 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001293 errmsg = "illegal encoding";
1294 goto utf8Error;
1295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001297 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 break;
1299
1300 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001301 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001302 (s[2] & 0xc0) != 0x80) {
1303 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001304 startinpos = s-starts;
1305 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001306 goto utf8Error;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001309 if (ch < 0x0800) {
1310 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001311 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001312
1313 XXX For wide builds (UCS-4) we should probably try
1314 to recombine the surrogates into a single code
1315 unit.
1316 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001317 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001318 startinpos = s-starts;
1319 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001320 goto utf8Error;
1321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001323 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001324 break;
1325
1326 case 4:
1327 if ((s[1] & 0xc0) != 0x80 ||
1328 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001329 (s[3] & 0xc0) != 0x80) {
1330 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 startinpos = s-starts;
1332 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001333 goto utf8Error;
1334 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001335 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1336 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1337 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001338 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001339 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001340 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001341 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001342 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001343 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001344 startinpos = s-starts;
1345 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001346 goto utf8Error;
1347 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001348#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001349 *p++ = (Py_UNICODE)ch;
1350#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001351 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001352
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001353 /* translate from 10000..10FFFF to 0..FFFF */
1354 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001355
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001356 /* high surrogate = top 10 bits added to D800 */
1357 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001358
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001359 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001360 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001361#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 break;
1363
1364 default:
1365 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001366 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001367 startinpos = s-starts;
1368 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001369 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370 }
1371 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001372 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001373
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001375 outpos = p-PyUnicode_AS_UNICODE(unicode);
1376 if (unicode_decode_call_errorhandler(
1377 errors, &errorHandler,
1378 "utf8", errmsg,
1379 starts, size, &startinpos, &endinpos, &exc, &s,
1380 (PyObject **)&unicode, &outpos, &p))
1381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 }
Walter Dörwald69652032004-09-07 20:24:22 +00001383 if (consumed)
1384 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385
1386 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001387 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388 goto onError;
1389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 return (PyObject *)unicode;
1393
1394onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395 Py_XDECREF(errorHandler);
1396 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001397 Py_DECREF(unicode);
1398 return NULL;
1399}
1400
Tim Peters602f7402002-04-27 18:03:26 +00001401/* Allocation strategy: if the string is short, convert into a stack buffer
1402 and allocate exactly as much space needed at the end. Else allocate the
1403 maximum possible needed (4 result bytes per Unicode character), and return
1404 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001405*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001406PyObject *
1407PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001408 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410{
Tim Peters602f7402002-04-27 18:03:26 +00001411#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001412
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001414 PyObject *v; /* result string object */
1415 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001416 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001417 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001418 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001419
Tim Peters602f7402002-04-27 18:03:26 +00001420 assert(s != NULL);
1421 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422
Tim Peters602f7402002-04-27 18:03:26 +00001423 if (size <= MAX_SHORT_UNICHARS) {
1424 /* Write into the stack buffer; nallocated can't overflow.
1425 * At the end, we'll allocate exactly as much heap space as it
1426 * turns out we need.
1427 */
1428 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1429 v = NULL; /* will allocate after we're done */
1430 p = stackbuf;
1431 }
1432 else {
1433 /* Overallocate on the heap, and give the excess back at the end. */
1434 nallocated = size * 4;
1435 if (nallocated / 4 != size) /* overflow! */
1436 return PyErr_NoMemory();
1437 v = PyString_FromStringAndSize(NULL, nallocated);
1438 if (v == NULL)
1439 return NULL;
1440 p = PyString_AS_STRING(v);
1441 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001442
Tim Peters602f7402002-04-27 18:03:26 +00001443 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001444 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001445
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001446 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001447 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001449
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001451 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001452 *p++ = (char)(0xc0 | (ch >> 6));
1453 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001454 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001455 else {
Tim Peters602f7402002-04-27 18:03:26 +00001456 /* Encode UCS2 Unicode ordinals */
1457 if (ch < 0x10000) {
1458 /* Special case: check for high surrogate */
1459 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1460 Py_UCS4 ch2 = s[i];
1461 /* Check for low surrogate and combine the two to
1462 form a UCS4 value */
1463 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001464 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001465 i++;
1466 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001467 }
Tim Peters602f7402002-04-27 18:03:26 +00001468 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001469 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001470 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001471 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1472 *p++ = (char)(0x80 | (ch & 0x3f));
1473 continue;
1474 }
1475encodeUCS4:
1476 /* Encode UCS4 Unicode ordinals */
1477 *p++ = (char)(0xf0 | (ch >> 18));
1478 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1480 *p++ = (char)(0x80 | (ch & 0x3f));
1481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001483
Tim Peters602f7402002-04-27 18:03:26 +00001484 if (v == NULL) {
1485 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001486 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001487 assert(nneeded <= nallocated);
1488 v = PyString_FromStringAndSize(stackbuf, nneeded);
1489 }
1490 else {
1491 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001492 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001493 assert(nneeded <= nallocated);
1494 _PyString_Resize(&v, nneeded);
1495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001497
Tim Peters602f7402002-04-27 18:03:26 +00001498#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499}
1500
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (!PyUnicode_Check(unicode)) {
1504 PyErr_BadArgument();
1505 return NULL;
1506 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001507 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1508 PyUnicode_GET_SIZE(unicode),
1509 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510}
1511
1512/* --- UTF-16 Codec ------------------------------------------------------- */
1513
Tim Peters772747b2001-08-09 22:21:55 +00001514PyObject *
1515PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001516 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001517 const char *errors,
1518 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519{
Walter Dörwald69652032004-09-07 20:24:22 +00001520 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1521}
1522
1523PyObject *
1524PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001525 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001526 const char *errors,
1527 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001528 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001530 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001531 Py_ssize_t startinpos;
1532 Py_ssize_t endinpos;
1533 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 PyUnicodeObject *unicode;
1535 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001536 const unsigned char *q, *e;
1537 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001538 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001539 /* Offsets from q for retrieving byte pairs in the right order. */
1540#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1541 int ihi = 1, ilo = 0;
1542#else
1543 int ihi = 0, ilo = 1;
1544#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 PyObject *errorHandler = NULL;
1546 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547
1548 /* Note: size will always be longer than the resulting Unicode
1549 character count */
1550 unicode = _PyUnicode_New(size);
1551 if (!unicode)
1552 return NULL;
1553 if (size == 0)
1554 return (PyObject *)unicode;
1555
1556 /* Unpack UTF-16 encoded data */
1557 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001558 q = (unsigned char *)s;
1559 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560
1561 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001562 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001564 /* Check for BOM marks (U+FEFF) in the input and adjust current
1565 byte order setting accordingly. In native mode, the leading BOM
1566 mark is skipped, in all other modes, it is copied to the output
1567 stream as-is (giving a ZWNBSP character). */
1568 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001569 if (size >= 2) {
1570 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001571#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001572 if (bom == 0xFEFF) {
1573 q += 2;
1574 bo = -1;
1575 }
1576 else if (bom == 0xFFFE) {
1577 q += 2;
1578 bo = 1;
1579 }
Tim Petersced69f82003-09-16 20:30:58 +00001580#else
Walter Dörwald69652032004-09-07 20:24:22 +00001581 if (bom == 0xFEFF) {
1582 q += 2;
1583 bo = 1;
1584 }
1585 else if (bom == 0xFFFE) {
1586 q += 2;
1587 bo = -1;
1588 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001589#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001590 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592
Tim Peters772747b2001-08-09 22:21:55 +00001593 if (bo == -1) {
1594 /* force LE */
1595 ihi = 1;
1596 ilo = 0;
1597 }
1598 else if (bo == 1) {
1599 /* force BE */
1600 ihi = 0;
1601 ilo = 1;
1602 }
1603
1604 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001605 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001606 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001607 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001608 if (consumed)
1609 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001610 errmsg = "truncated data";
1611 startinpos = ((const char *)q)-starts;
1612 endinpos = ((const char *)e)-starts;
1613 goto utf16Error;
1614 /* The remaining input chars are ignored if the callback
1615 chooses to skip the input */
1616 }
1617 ch = (q[ihi] << 8) | q[ilo];
1618
Tim Peters772747b2001-08-09 22:21:55 +00001619 q += 2;
1620
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 if (ch < 0xD800 || ch > 0xDFFF) {
1622 *p++ = ch;
1623 continue;
1624 }
1625
1626 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001627 if (q >= e) {
1628 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001629 startinpos = (((const char *)q)-2)-starts;
1630 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001631 goto utf16Error;
1632 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001633 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001634 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1635 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001636 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001637#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001638 *p++ = ch;
1639 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001640#else
1641 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001642#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001643 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001644 }
1645 else {
1646 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001647 startinpos = (((const char *)q)-4)-starts;
1648 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001649 goto utf16Error;
1650 }
1651
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001653 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001654 startinpos = (((const char *)q)-2)-starts;
1655 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001656 /* Fall through to report the error */
1657
1658 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001659 outpos = p-PyUnicode_AS_UNICODE(unicode);
1660 if (unicode_decode_call_errorhandler(
1661 errors, &errorHandler,
1662 "utf16", errmsg,
1663 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1664 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001666 }
1667
1668 if (byteorder)
1669 *byteorder = bo;
1670
Walter Dörwald69652032004-09-07 20:24:22 +00001671 if (consumed)
1672 *consumed = (const char *)q-starts;
1673
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001675 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 goto onError;
1677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001678 Py_XDECREF(errorHandler);
1679 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001680 return (PyObject *)unicode;
1681
1682onError:
1683 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001684 Py_XDECREF(errorHandler);
1685 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686 return NULL;
1687}
1688
Tim Peters772747b2001-08-09 22:21:55 +00001689PyObject *
1690PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001691 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001692 const char *errors,
1693 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694{
1695 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001696 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001697#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001698 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001699#else
1700 const int pairs = 0;
1701#endif
Tim Peters772747b2001-08-09 22:21:55 +00001702 /* Offsets from p for storing byte pairs in the right order. */
1703#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1704 int ihi = 1, ilo = 0;
1705#else
1706 int ihi = 0, ilo = 1;
1707#endif
1708
1709#define STORECHAR(CH) \
1710 do { \
1711 p[ihi] = ((CH) >> 8) & 0xff; \
1712 p[ilo] = (CH) & 0xff; \
1713 p += 2; \
1714 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001716#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001717 for (i = pairs = 0; i < size; i++)
1718 if (s[i] >= 0x10000)
1719 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001720#endif
Tim Petersced69f82003-09-16 20:30:58 +00001721 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001722 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 if (v == NULL)
1724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Tim Peters772747b2001-08-09 22:21:55 +00001726 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001728 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001729 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001730 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001731
1732 if (byteorder == -1) {
1733 /* force LE */
1734 ihi = 1;
1735 ilo = 0;
1736 }
1737 else if (byteorder == 1) {
1738 /* force BE */
1739 ihi = 0;
1740 ilo = 1;
1741 }
1742
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001743 while (size-- > 0) {
1744 Py_UNICODE ch = *s++;
1745 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001746#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001747 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001748 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1749 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001751#endif
Tim Peters772747b2001-08-09 22:21:55 +00001752 STORECHAR(ch);
1753 if (ch2)
1754 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001757#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758}
1759
1760PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1761{
1762 if (!PyUnicode_Check(unicode)) {
1763 PyErr_BadArgument();
1764 return NULL;
1765 }
1766 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1767 PyUnicode_GET_SIZE(unicode),
1768 NULL,
1769 0);
1770}
1771
1772/* --- Unicode Escape Codec ----------------------------------------------- */
1773
Fredrik Lundh06d12682001-01-24 07:59:11 +00001774static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001775
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001777 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 const char *errors)
1779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001780 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001781 Py_ssize_t startinpos;
1782 Py_ssize_t endinpos;
1783 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001784 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001788 char* message;
1789 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001790 PyObject *errorHandler = NULL;
1791 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001792
Guido van Rossumd57fd912000-03-10 22:53:23 +00001793 /* Escaped strings will always be longer than the resulting
1794 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 length after conversion to the true value.
1796 (but if the error callback returns a long replacement string
1797 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 v = _PyUnicode_New(size);
1799 if (v == NULL)
1800 goto onError;
1801 if (size == 0)
1802 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001804 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 while (s < end) {
1808 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001809 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811
1812 /* Non-escape characters are interpreted as Unicode ordinals */
1813 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001814 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 continue;
1816 }
1817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 /* \ - Escapes */
1820 s++;
1821 switch (*s++) {
1822
1823 /* \x escapes */
1824 case '\n': break;
1825 case '\\': *p++ = '\\'; break;
1826 case '\'': *p++ = '\''; break;
1827 case '\"': *p++ = '\"'; break;
1828 case 'b': *p++ = '\b'; break;
1829 case 'f': *p++ = '\014'; break; /* FF */
1830 case 't': *p++ = '\t'; break;
1831 case 'n': *p++ = '\n'; break;
1832 case 'r': *p++ = '\r'; break;
1833 case 'v': *p++ = '\013'; break; /* VT */
1834 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1835
1836 /* \OOO (octal) escapes */
1837 case '0': case '1': case '2': case '3':
1838 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001839 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001841 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001843 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001845 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 break;
1847
Fredrik Lundhccc74732001-02-18 22:13:49 +00001848 /* hex escapes */
1849 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001851 digits = 2;
1852 message = "truncated \\xXX escape";
1853 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Fredrik Lundhccc74732001-02-18 22:13:49 +00001855 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001857 digits = 4;
1858 message = "truncated \\uXXXX escape";
1859 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860
Fredrik Lundhccc74732001-02-18 22:13:49 +00001861 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001862 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001863 digits = 8;
1864 message = "truncated \\UXXXXXXXX escape";
1865 hexescape:
1866 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 outpos = p-PyUnicode_AS_UNICODE(v);
1868 if (s+digits>end) {
1869 endinpos = size;
1870 if (unicode_decode_call_errorhandler(
1871 errors, &errorHandler,
1872 "unicodeescape", "end of string in escape sequence",
1873 starts, size, &startinpos, &endinpos, &exc, &s,
1874 (PyObject **)&v, &outpos, &p))
1875 goto onError;
1876 goto nextByte;
1877 }
1878 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001879 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001880 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001881 endinpos = (s+i+1)-starts;
1882 if (unicode_decode_call_errorhandler(
1883 errors, &errorHandler,
1884 "unicodeescape", message,
1885 starts, size, &startinpos, &endinpos, &exc, &s,
1886 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001888 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001889 }
1890 chr = (chr<<4) & ~0xF;
1891 if (c >= '0' && c <= '9')
1892 chr += c - '0';
1893 else if (c >= 'a' && c <= 'f')
1894 chr += 10 + c - 'a';
1895 else
1896 chr += 10 + c - 'A';
1897 }
1898 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001899 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001900 /* _decoding_error will have already written into the
1901 target buffer. */
1902 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001904 /* when we get here, chr is a 32-bit unicode character */
1905 if (chr <= 0xffff)
1906 /* UCS-2 character */
1907 *p++ = (Py_UNICODE) chr;
1908 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001909 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001910 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001911#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 *p++ = chr;
1913#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001914 chr -= 0x10000L;
1915 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001916 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001917#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001918 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 endinpos = s-starts;
1920 outpos = p-PyUnicode_AS_UNICODE(v);
1921 if (unicode_decode_call_errorhandler(
1922 errors, &errorHandler,
1923 "unicodeescape", "illegal Unicode character",
1924 starts, size, &startinpos, &endinpos, &exc, &s,
1925 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001926 goto onError;
1927 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001928 break;
1929
1930 /* \N{name} */
1931 case 'N':
1932 message = "malformed \\N character escape";
1933 if (ucnhash_CAPI == NULL) {
1934 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001935 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001936 m = PyImport_ImportModule("unicodedata");
1937 if (m == NULL)
1938 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001939 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00001940 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001941 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00001943 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00001944 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00001945 if (ucnhash_CAPI == NULL)
1946 goto ucnhashError;
1947 }
1948 if (*s == '{') {
1949 const char *start = s+1;
1950 /* look for the closing brace */
1951 while (*s != '}' && s < end)
1952 s++;
1953 if (s > start && s < end && *s == '}') {
1954 /* found a name. look it up in the unicode database */
1955 message = "unknown Unicode character name";
1956 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00001957 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001958 goto store;
1959 }
1960 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001961 endinpos = s-starts;
1962 outpos = p-PyUnicode_AS_UNICODE(v);
1963 if (unicode_decode_call_errorhandler(
1964 errors, &errorHandler,
1965 "unicodeescape", message,
1966 starts, size, &startinpos, &endinpos, &exc, &s,
1967 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001968 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001969 break;
1970
1971 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001972 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001973 message = "\\ at end of string";
1974 s--;
1975 endinpos = s-starts;
1976 outpos = p-PyUnicode_AS_UNICODE(v);
1977 if (unicode_decode_call_errorhandler(
1978 errors, &errorHandler,
1979 "unicodeescape", message,
1980 starts, size, &startinpos, &endinpos, &exc, &s,
1981 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001982 goto onError;
1983 }
1984 else {
1985 *p++ = '\\';
1986 *p++ = (unsigned char)s[-1];
1987 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001988 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 nextByte:
1991 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00001993 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001994 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001995 Py_XDECREF(errorHandler);
1996 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001998
Fredrik Lundhccc74732001-02-18 22:13:49 +00001999ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002000 PyErr_SetString(
2001 PyExc_UnicodeError,
2002 "\\N escapes not supported (can't load unicodedata module)"
2003 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002004 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002005 Py_XDECREF(errorHandler);
2006 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002007 return NULL;
2008
Fredrik Lundhccc74732001-02-18 22:13:49 +00002009onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 Py_XDECREF(errorHandler);
2012 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 return NULL;
2014}
2015
2016/* Return a Unicode-Escape string version of the Unicode object.
2017
2018 If quotes is true, the string is enclosed in u"" or u'' quotes as
2019 appropriate.
2020
2021*/
2022
Barry Warsaw51ac5802000-03-20 16:36:48 +00002023static const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002024 Py_ssize_t size,
Barry Warsaw51ac5802000-03-20 16:36:48 +00002025 Py_UNICODE ch);
2026
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027static
2028PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002029 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 int quotes)
2031{
2032 PyObject *repr;
2033 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002035 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036
2037 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2038 if (repr == NULL)
2039 return NULL;
2040
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002041 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042
2043 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002045 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 !findchar(s, size, '"')) ? '"' : '\'';
2047 }
2048 while (size-- > 0) {
2049 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002050
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002051 /* Escape quotes and backslashes */
2052 if ((quotes &&
2053 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 *p++ = '\\';
2055 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002056 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002057 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002058
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002059#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002060 /* Map 21-bit characters to '\U00xxxxxx' */
2061 else if (ch >= 0x10000) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00002062 Py_ssize_t offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002063
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002064 /* Resize the string if necessary */
2065 if (offset + 12 > PyString_GET_SIZE(repr)) {
2066 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002067 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002068 p = PyString_AS_STRING(repr) + offset;
2069 }
2070
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002071 *p++ = '\\';
2072 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002073 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2074 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2075 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2076 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2077 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2078 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2079 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002080 *p++ = hexdigit[ch & 0x0000000F];
2081 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002082 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002083#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002084 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2085 else if (ch >= 0xD800 && ch < 0xDC00) {
2086 Py_UNICODE ch2;
2087 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002088
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002089 ch2 = *s++;
2090 size--;
2091 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2092 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2093 *p++ = '\\';
2094 *p++ = 'U';
2095 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2096 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2097 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2098 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2099 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2100 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2101 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2102 *p++ = hexdigit[ucs & 0x0000000F];
2103 continue;
2104 }
2105 /* Fall through: isolated surrogates are copied as-is */
2106 s--;
2107 size++;
2108 }
2109
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002111 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 *p++ = '\\';
2113 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002114 *p++ = hexdigit[(ch >> 12) & 0x000F];
2115 *p++ = hexdigit[(ch >> 8) & 0x000F];
2116 *p++ = hexdigit[(ch >> 4) & 0x000F];
2117 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002119
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002120 /* Map special whitespace to '\t', \n', '\r' */
2121 else if (ch == '\t') {
2122 *p++ = '\\';
2123 *p++ = 't';
2124 }
2125 else if (ch == '\n') {
2126 *p++ = '\\';
2127 *p++ = 'n';
2128 }
2129 else if (ch == '\r') {
2130 *p++ = '\\';
2131 *p++ = 'r';
2132 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002133
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002134 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002135 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002137 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002138 *p++ = hexdigit[(ch >> 4) & 0x000F];
2139 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002140 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002141
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 /* Copy everything else as-is */
2143 else
2144 *p++ = (char) ch;
2145 }
2146 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002147 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148
2149 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002150 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151 return repr;
2152}
2153
2154PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002155 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156{
2157 return unicodeescape_string(s, size, 0);
2158}
2159
2160PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2161{
2162 if (!PyUnicode_Check(unicode)) {
2163 PyErr_BadArgument();
2164 return NULL;
2165 }
2166 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2167 PyUnicode_GET_SIZE(unicode));
2168}
2169
2170/* --- Raw Unicode Escape Codec ------------------------------------------- */
2171
2172PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002173 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 const char *errors)
2175{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002177 Py_ssize_t startinpos;
2178 Py_ssize_t endinpos;
2179 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 const char *end;
2183 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002184 PyObject *errorHandler = NULL;
2185 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002186
Guido van Rossumd57fd912000-03-10 22:53:23 +00002187 /* Escaped strings will always be longer than the resulting
2188 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002189 length after conversion to the true value. (But decoding error
2190 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191 v = _PyUnicode_New(size);
2192 if (v == NULL)
2193 goto onError;
2194 if (size == 0)
2195 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002196 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 end = s + size;
2198 while (s < end) {
2199 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002200 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002202 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203
2204 /* Non-escape characters are interpreted as Unicode ordinals */
2205 if (*s != '\\') {
2206 *p++ = (unsigned char)*s++;
2207 continue;
2208 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002209 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210
2211 /* \u-escapes are only interpreted iff the number of leading
2212 backslashes if odd */
2213 bs = s;
2214 for (;s < end;) {
2215 if (*s != '\\')
2216 break;
2217 *p++ = (unsigned char)*s++;
2218 }
2219 if (((s - bs) & 1) == 0 ||
2220 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002221 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 continue;
2223 }
2224 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002225 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226 s++;
2227
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002228 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002229 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002230 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002231 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002233 endinpos = s-starts;
2234 if (unicode_decode_call_errorhandler(
2235 errors, &errorHandler,
2236 "rawunicodeescape", "truncated \\uXXXX",
2237 starts, size, &startinpos, &endinpos, &exc, &s,
2238 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002240 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 }
2242 x = (x<<4) & ~0xF;
2243 if (c >= '0' && c <= '9')
2244 x += c - '0';
2245 else if (c >= 'a' && c <= 'f')
2246 x += 10 + c - 'a';
2247 else
2248 x += 10 + c - 'A';
2249 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002250#ifndef Py_UNICODE_WIDE
2251 if (x > 0x10000) {
2252 if (unicode_decode_call_errorhandler(
2253 errors, &errorHandler,
2254 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2255 starts, size, &startinpos, &endinpos, &exc, &s,
2256 (PyObject **)&v, &outpos, &p))
2257 goto onError;
2258 }
2259#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002260 *p++ = x;
2261 nextByte:
2262 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002264 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002265 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 Py_XDECREF(errorHandler);
2267 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002269
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 onError:
2271 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_XDECREF(errorHandler);
2273 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274 return NULL;
2275}
2276
2277PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002278 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279{
2280 PyObject *repr;
2281 char *p;
2282 char *q;
2283
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002284 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002286#ifdef Py_UNICODE_WIDE
2287 repr = PyString_FromStringAndSize(NULL, 10 * size);
2288#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002290#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 if (repr == NULL)
2292 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002293 if (size == 0)
2294 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295
2296 p = q = PyString_AS_STRING(repr);
2297 while (size-- > 0) {
2298 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002299#ifdef Py_UNICODE_WIDE
2300 /* Map 32-bit characters to '\Uxxxxxxxx' */
2301 if (ch >= 0x10000) {
2302 *p++ = '\\';
2303 *p++ = 'U';
2304 *p++ = hexdigit[(ch >> 28) & 0xf];
2305 *p++ = hexdigit[(ch >> 24) & 0xf];
2306 *p++ = hexdigit[(ch >> 20) & 0xf];
2307 *p++ = hexdigit[(ch >> 16) & 0xf];
2308 *p++ = hexdigit[(ch >> 12) & 0xf];
2309 *p++ = hexdigit[(ch >> 8) & 0xf];
2310 *p++ = hexdigit[(ch >> 4) & 0xf];
2311 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002312 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002313 else
2314#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315 /* Map 16-bit characters to '\uxxxx' */
2316 if (ch >= 256) {
2317 *p++ = '\\';
2318 *p++ = 'u';
2319 *p++ = hexdigit[(ch >> 12) & 0xf];
2320 *p++ = hexdigit[(ch >> 8) & 0xf];
2321 *p++ = hexdigit[(ch >> 4) & 0xf];
2322 *p++ = hexdigit[ch & 15];
2323 }
2324 /* Copy everything else as-is */
2325 else
2326 *p++ = (char) ch;
2327 }
2328 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002329 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 return repr;
2331}
2332
2333PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2334{
2335 if (!PyUnicode_Check(unicode)) {
2336 PyErr_BadArgument();
2337 return NULL;
2338 }
2339 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2340 PyUnicode_GET_SIZE(unicode));
2341}
2342
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002343/* --- Unicode Internal Codec ------------------------------------------- */
2344
2345PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002346 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002347 const char *errors)
2348{
2349 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002350 Py_ssize_t startinpos;
2351 Py_ssize_t endinpos;
2352 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002353 PyUnicodeObject *v;
2354 Py_UNICODE *p;
2355 const char *end;
2356 const char *reason;
2357 PyObject *errorHandler = NULL;
2358 PyObject *exc = NULL;
2359
Neal Norwitzd43069c2006-01-08 01:12:10 +00002360#ifdef Py_UNICODE_WIDE
2361 Py_UNICODE unimax = PyUnicode_GetMax();
2362#endif
2363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002364 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2365 if (v == NULL)
2366 goto onError;
2367 if (PyUnicode_GetSize((PyObject *)v) == 0)
2368 return (PyObject *)v;
2369 p = PyUnicode_AS_UNICODE(v);
2370 end = s + size;
2371
2372 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002373 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002374 /* We have to sanity check the raw data, otherwise doom looms for
2375 some malformed UCS-4 data. */
2376 if (
2377 #ifdef Py_UNICODE_WIDE
2378 *p > unimax || *p < 0 ||
2379 #endif
2380 end-s < Py_UNICODE_SIZE
2381 )
2382 {
2383 startinpos = s - starts;
2384 if (end-s < Py_UNICODE_SIZE) {
2385 endinpos = end-starts;
2386 reason = "truncated input";
2387 }
2388 else {
2389 endinpos = s - starts + Py_UNICODE_SIZE;
2390 reason = "illegal code point (> 0x10FFFF)";
2391 }
2392 outpos = p - PyUnicode_AS_UNICODE(v);
2393 if (unicode_decode_call_errorhandler(
2394 errors, &errorHandler,
2395 "unicode_internal", reason,
2396 starts, size, &startinpos, &endinpos, &exc, &s,
2397 (PyObject **)&v, &outpos, &p)) {
2398 goto onError;
2399 }
2400 }
2401 else {
2402 p++;
2403 s += Py_UNICODE_SIZE;
2404 }
2405 }
2406
Martin v. Löwis412fb672006-04-13 06:34:32 +00002407 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002408 goto onError;
2409 Py_XDECREF(errorHandler);
2410 Py_XDECREF(exc);
2411 return (PyObject *)v;
2412
2413 onError:
2414 Py_XDECREF(v);
2415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
2417 return NULL;
2418}
2419
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420/* --- Latin-1 Codec ------------------------------------------------------ */
2421
2422PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002423 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424 const char *errors)
2425{
2426 PyUnicodeObject *v;
2427 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002428
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002430 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002431 Py_UNICODE r = *(unsigned char*)s;
2432 return PyUnicode_FromUnicode(&r, 1);
2433 }
2434
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 v = _PyUnicode_New(size);
2436 if (v == NULL)
2437 goto onError;
2438 if (size == 0)
2439 return (PyObject *)v;
2440 p = PyUnicode_AS_UNICODE(v);
2441 while (size-- > 0)
2442 *p++ = (unsigned char)*s++;
2443 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002444
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 onError:
2446 Py_XDECREF(v);
2447 return NULL;
2448}
2449
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002450/* create or adjust a UnicodeEncodeError */
2451static void make_encode_exception(PyObject **exceptionObject,
2452 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002453 const Py_UNICODE *unicode, Py_ssize_t size,
2454 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002455 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002457 if (*exceptionObject == NULL) {
2458 *exceptionObject = PyUnicodeEncodeError_Create(
2459 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 }
2461 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002462 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2463 goto onError;
2464 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2465 goto onError;
2466 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2467 goto onError;
2468 return;
2469 onError:
2470 Py_DECREF(*exceptionObject);
2471 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 }
2473}
2474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475/* raises a UnicodeEncodeError */
2476static void raise_encode_exception(PyObject **exceptionObject,
2477 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002478 const Py_UNICODE *unicode, Py_ssize_t size,
2479 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 const char *reason)
2481{
2482 make_encode_exception(exceptionObject,
2483 encoding, unicode, size, startpos, endpos, reason);
2484 if (*exceptionObject != NULL)
2485 PyCodec_StrictErrors(*exceptionObject);
2486}
2487
2488/* error handling callback helper:
2489 build arguments, call the callback and check the arguments,
2490 put the result into newpos and return the replacement string, which
2491 has to be freed by the caller */
2492static PyObject *unicode_encode_call_errorhandler(const char *errors,
2493 PyObject **errorHandler,
2494 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002495 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2496 Py_ssize_t startpos, Py_ssize_t endpos,
2497 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002499 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500
2501 PyObject *restuple;
2502 PyObject *resunicode;
2503
2504 if (*errorHandler == NULL) {
2505 *errorHandler = PyCodec_LookupError(errors);
2506 if (*errorHandler == NULL)
2507 return NULL;
2508 }
2509
2510 make_encode_exception(exceptionObject,
2511 encoding, unicode, size, startpos, endpos, reason);
2512 if (*exceptionObject == NULL)
2513 return NULL;
2514
2515 restuple = PyObject_CallFunctionObjArgs(
2516 *errorHandler, *exceptionObject, NULL);
2517 if (restuple == NULL)
2518 return NULL;
2519 if (!PyTuple_Check(restuple)) {
2520 PyErr_Format(PyExc_TypeError, &argparse[4]);
2521 Py_DECREF(restuple);
2522 return NULL;
2523 }
2524 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2525 &resunicode, newpos)) {
2526 Py_DECREF(restuple);
2527 return NULL;
2528 }
2529 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002530 *newpos = size+*newpos;
2531 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002532 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002533 Py_DECREF(restuple);
2534 return NULL;
2535 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002536 Py_INCREF(resunicode);
2537 Py_DECREF(restuple);
2538 return resunicode;
2539}
2540
2541static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002542 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002543 const char *errors,
2544 int limit)
2545{
2546 /* output object */
2547 PyObject *res;
2548 /* pointers to the beginning and end+1 of input */
2549 const Py_UNICODE *startp = p;
2550 const Py_UNICODE *endp = p + size;
2551 /* pointer to the beginning of the unencodable characters */
2552 /* const Py_UNICODE *badp = NULL; */
2553 /* pointer into the output */
2554 char *str;
2555 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002556 Py_ssize_t respos = 0;
2557 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002558 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2559 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 PyObject *errorHandler = NULL;
2561 PyObject *exc = NULL;
2562 /* the following variable is used for caching string comparisons
2563 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2564 int known_errorHandler = -1;
2565
2566 /* allocate enough for a simple encoding without
2567 replacements, if we need more, we'll resize */
2568 res = PyString_FromStringAndSize(NULL, size);
2569 if (res == NULL)
2570 goto onError;
2571 if (size == 0)
2572 return res;
2573 str = PyString_AS_STRING(res);
2574 ressize = size;
2575
2576 while (p<endp) {
2577 Py_UNICODE c = *p;
2578
2579 /* can we encode this? */
2580 if (c<limit) {
2581 /* no overflow check, because we know that the space is enough */
2582 *str++ = (char)c;
2583 ++p;
2584 }
2585 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002586 Py_ssize_t unicodepos = p-startp;
2587 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002588 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002589 Py_ssize_t repsize;
2590 Py_ssize_t newpos;
2591 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002592 Py_UNICODE *uni2;
2593 /* startpos for collecting unencodable chars */
2594 const Py_UNICODE *collstart = p;
2595 const Py_UNICODE *collend = p;
2596 /* find all unecodable characters */
2597 while ((collend < endp) && ((*collend)>=limit))
2598 ++collend;
2599 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2600 if (known_errorHandler==-1) {
2601 if ((errors==NULL) || (!strcmp(errors, "strict")))
2602 known_errorHandler = 1;
2603 else if (!strcmp(errors, "replace"))
2604 known_errorHandler = 2;
2605 else if (!strcmp(errors, "ignore"))
2606 known_errorHandler = 3;
2607 else if (!strcmp(errors, "xmlcharrefreplace"))
2608 known_errorHandler = 4;
2609 else
2610 known_errorHandler = 0;
2611 }
2612 switch (known_errorHandler) {
2613 case 1: /* strict */
2614 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2615 goto onError;
2616 case 2: /* replace */
2617 while (collstart++<collend)
2618 *str++ = '?'; /* fall through */
2619 case 3: /* ignore */
2620 p = collend;
2621 break;
2622 case 4: /* xmlcharrefreplace */
2623 respos = str-PyString_AS_STRING(res);
2624 /* determine replacement size (temporarily (mis)uses p) */
2625 for (p = collstart, repsize = 0; p < collend; ++p) {
2626 if (*p<10)
2627 repsize += 2+1+1;
2628 else if (*p<100)
2629 repsize += 2+2+1;
2630 else if (*p<1000)
2631 repsize += 2+3+1;
2632 else if (*p<10000)
2633 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002634#ifndef Py_UNICODE_WIDE
2635 else
2636 repsize += 2+5+1;
2637#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002638 else if (*p<100000)
2639 repsize += 2+5+1;
2640 else if (*p<1000000)
2641 repsize += 2+6+1;
2642 else
2643 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002644#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002645 }
2646 requiredsize = respos+repsize+(endp-collend);
2647 if (requiredsize > ressize) {
2648 if (requiredsize<2*ressize)
2649 requiredsize = 2*ressize;
2650 if (_PyString_Resize(&res, requiredsize))
2651 goto onError;
2652 str = PyString_AS_STRING(res) + respos;
2653 ressize = requiredsize;
2654 }
2655 /* generate replacement (temporarily (mis)uses p) */
2656 for (p = collstart; p < collend; ++p) {
2657 str += sprintf(str, "&#%d;", (int)*p);
2658 }
2659 p = collend;
2660 break;
2661 default:
2662 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2663 encoding, reason, startp, size, &exc,
2664 collstart-startp, collend-startp, &newpos);
2665 if (repunicode == NULL)
2666 goto onError;
2667 /* need more space? (at least enough for what we
2668 have+the replacement+the rest of the string, so
2669 we won't have to check space for encodable characters) */
2670 respos = str-PyString_AS_STRING(res);
2671 repsize = PyUnicode_GET_SIZE(repunicode);
2672 requiredsize = respos+repsize+(endp-collend);
2673 if (requiredsize > ressize) {
2674 if (requiredsize<2*ressize)
2675 requiredsize = 2*ressize;
2676 if (_PyString_Resize(&res, requiredsize)) {
2677 Py_DECREF(repunicode);
2678 goto onError;
2679 }
2680 str = PyString_AS_STRING(res) + respos;
2681 ressize = requiredsize;
2682 }
2683 /* check if there is anything unencodable in the replacement
2684 and copy it to the output */
2685 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2686 c = *uni2;
2687 if (c >= limit) {
2688 raise_encode_exception(&exc, encoding, startp, size,
2689 unicodepos, unicodepos+1, reason);
2690 Py_DECREF(repunicode);
2691 goto onError;
2692 }
2693 *str = (char)c;
2694 }
2695 p = startp + newpos;
2696 Py_DECREF(repunicode);
2697 }
2698 }
2699 }
2700 /* Resize if we allocated to much */
2701 respos = str-PyString_AS_STRING(res);
2702 if (respos<ressize)
2703 /* If this falls res will be NULL */
2704 _PyString_Resize(&res, respos);
2705 Py_XDECREF(errorHandler);
2706 Py_XDECREF(exc);
2707 return res;
2708
2709 onError:
2710 Py_XDECREF(res);
2711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
2713 return NULL;
2714}
2715
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002717 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 const char *errors)
2719{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002720 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721}
2722
2723PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2724{
2725 if (!PyUnicode_Check(unicode)) {
2726 PyErr_BadArgument();
2727 return NULL;
2728 }
2729 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2730 PyUnicode_GET_SIZE(unicode),
2731 NULL);
2732}
2733
2734/* --- 7-bit ASCII Codec -------------------------------------------------- */
2735
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002737 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 const char *errors)
2739{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 PyUnicodeObject *v;
2742 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002743 Py_ssize_t startinpos;
2744 Py_ssize_t endinpos;
2745 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 const char *e;
2747 PyObject *errorHandler = NULL;
2748 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002749
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002751 if (size == 1 && *(unsigned char*)s < 128) {
2752 Py_UNICODE r = *(unsigned char*)s;
2753 return PyUnicode_FromUnicode(&r, 1);
2754 }
Tim Petersced69f82003-09-16 20:30:58 +00002755
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 v = _PyUnicode_New(size);
2757 if (v == NULL)
2758 goto onError;
2759 if (size == 0)
2760 return (PyObject *)v;
2761 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002762 e = s + size;
2763 while (s < e) {
2764 register unsigned char c = (unsigned char)*s;
2765 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 ++s;
2768 }
2769 else {
2770 startinpos = s-starts;
2771 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002772 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002773 if (unicode_decode_call_errorhandler(
2774 errors, &errorHandler,
2775 "ascii", "ordinal not in range(128)",
2776 starts, size, &startinpos, &endinpos, &exc, &s,
2777 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002781 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00002782 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002783 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002784 Py_XDECREF(errorHandler);
2785 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002787
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 onError:
2789 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_XDECREF(errorHandler);
2791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return NULL;
2793}
2794
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002796 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 const char *errors)
2798{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002799 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800}
2801
2802PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2803{
2804 if (!PyUnicode_Check(unicode)) {
2805 PyErr_BadArgument();
2806 return NULL;
2807 }
2808 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2809 PyUnicode_GET_SIZE(unicode),
2810 NULL);
2811}
2812
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002813#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002814
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002815/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002816
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002817PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002818 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002819 const char *errors)
2820{
2821 PyUnicodeObject *v;
2822 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002823 DWORD usize;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002824
2825 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002826 assert(size < INT_MAX);
2827 usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002828 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002829 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2830
2831 v = _PyUnicode_New(usize);
2832 if (v == NULL)
2833 return NULL;
2834 if (usize == 0)
2835 return (PyObject *)v;
2836 p = PyUnicode_AS_UNICODE(v);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002837 if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002838 Py_DECREF(v);
2839 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2840 }
2841
2842 return (PyObject *)v;
2843}
2844
2845PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002846 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002847 const char *errors)
2848{
2849 PyObject *repr;
2850 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002851 DWORD mbcssize;
2852
2853 /* If there are no characters, bail now! */
2854 if (size==0)
2855 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002856
2857 /* First get the size of the result */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002858 assert(size<INT_MAX);
2859 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002860 if (mbcssize==0)
2861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2862
2863 repr = PyString_FromStringAndSize(NULL, mbcssize);
2864 if (repr == NULL)
2865 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002866 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002867 return repr;
2868
2869 /* Do the conversion */
2870 s = PyString_AS_STRING(repr);
Martin v. Löwis18e16552006-02-15 17:27:45 +00002871 assert(size < INT_MAX);
2872 if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002873 Py_DECREF(repr);
2874 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2875 }
2876 return repr;
2877}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002878
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002879PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2880{
2881 if (!PyUnicode_Check(unicode)) {
2882 PyErr_BadArgument();
2883 return NULL;
2884 }
2885 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2886 PyUnicode_GET_SIZE(unicode),
2887 NULL);
2888}
2889
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002890#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002891
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892/* --- Character Mapping Codec -------------------------------------------- */
2893
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002895 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896 PyObject *mapping,
2897 const char *errors)
2898{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002899 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002900 Py_ssize_t startinpos;
2901 Py_ssize_t endinpos;
2902 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 PyUnicodeObject *v;
2905 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 PyObject *errorHandler = NULL;
2908 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002909 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002910 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002911
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 /* Default to Latin-1 */
2913 if (mapping == NULL)
2914 return PyUnicode_DecodeLatin1(s, size, errors);
2915
2916 v = _PyUnicode_New(size);
2917 if (v == NULL)
2918 goto onError;
2919 if (size == 0)
2920 return (PyObject *)v;
2921 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002922 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002923 if (PyUnicode_CheckExact(mapping)) {
2924 mapstring = PyUnicode_AS_UNICODE(mapping);
2925 maplen = PyUnicode_GET_SIZE(mapping);
2926 while (s < e) {
2927 unsigned char ch = *s;
2928 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002930 if (ch < maplen)
2931 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002933 if (x == 0xfffe) {
2934 /* undefined mapping */
2935 outpos = p-PyUnicode_AS_UNICODE(v);
2936 startinpos = s-starts;
2937 endinpos = startinpos+1;
2938 if (unicode_decode_call_errorhandler(
2939 errors, &errorHandler,
2940 "charmap", "character maps to <undefined>",
2941 starts, size, &startinpos, &endinpos, &exc, &s,
2942 (PyObject **)&v, &outpos, &p)) {
2943 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002944 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002945 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002946 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002947 *p++ = x;
2948 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002950 }
2951 else {
2952 while (s < e) {
2953 unsigned char ch = *s;
2954 PyObject *w, *x;
2955
2956 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2957 w = PyInt_FromLong((long)ch);
2958 if (w == NULL)
2959 goto onError;
2960 x = PyObject_GetItem(mapping, w);
2961 Py_DECREF(w);
2962 if (x == NULL) {
2963 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2964 /* No mapping found means: mapping is undefined. */
2965 PyErr_Clear();
2966 x = Py_None;
2967 Py_INCREF(x);
2968 } else
2969 goto onError;
2970 }
2971
2972 /* Apply mapping */
2973 if (PyInt_Check(x)) {
2974 long value = PyInt_AS_LONG(x);
2975 if (value < 0 || value > 65535) {
2976 PyErr_SetString(PyExc_TypeError,
2977 "character mapping must be in range(65536)");
2978 Py_DECREF(x);
2979 goto onError;
2980 }
2981 *p++ = (Py_UNICODE)value;
2982 }
2983 else if (x == Py_None) {
2984 /* undefined mapping */
2985 outpos = p-PyUnicode_AS_UNICODE(v);
2986 startinpos = s-starts;
2987 endinpos = startinpos+1;
2988 if (unicode_decode_call_errorhandler(
2989 errors, &errorHandler,
2990 "charmap", "character maps to <undefined>",
2991 starts, size, &startinpos, &endinpos, &exc, &s,
2992 (PyObject **)&v, &outpos, &p)) {
2993 Py_DECREF(x);
2994 goto onError;
2995 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002996 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002997 continue;
2998 }
2999 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003000 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003001
3002 if (targetsize == 1)
3003 /* 1-1 mapping */
3004 *p++ = *PyUnicode_AS_UNICODE(x);
3005
3006 else if (targetsize > 1) {
3007 /* 1-n mapping */
3008 if (targetsize > extrachars) {
3009 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003010 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3011 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003012 (targetsize << 2);
3013 extrachars += needed;
3014 if (_PyUnicode_Resize(&v,
3015 PyUnicode_GET_SIZE(v) + needed) < 0) {
3016 Py_DECREF(x);
3017 goto onError;
3018 }
3019 p = PyUnicode_AS_UNICODE(v) + oldpos;
3020 }
3021 Py_UNICODE_COPY(p,
3022 PyUnicode_AS_UNICODE(x),
3023 targetsize);
3024 p += targetsize;
3025 extrachars -= targetsize;
3026 }
3027 /* 1-0 mapping: skip the character */
3028 }
3029 else {
3030 /* wrong return value */
3031 PyErr_SetString(PyExc_TypeError,
3032 "character mapping must return integer, None or unicode");
3033 Py_DECREF(x);
3034 goto onError;
3035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003037 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
3040 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003041 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003042 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003043 Py_XDECREF(errorHandler);
3044 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003046
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003048 Py_XDECREF(errorHandler);
3049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 Py_XDECREF(v);
3051 return NULL;
3052}
3053
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054/* Lookup the character ch in the mapping. If the character
3055 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003056 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003059 PyObject *w = PyInt_FromLong((long)c);
3060 PyObject *x;
3061
3062 if (w == NULL)
3063 return NULL;
3064 x = PyObject_GetItem(mapping, w);
3065 Py_DECREF(w);
3066 if (x == NULL) {
3067 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3068 /* No mapping found means: mapping is undefined. */
3069 PyErr_Clear();
3070 x = Py_None;
3071 Py_INCREF(x);
3072 return x;
3073 } else
3074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003076 else if (x == Py_None)
3077 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 else if (PyInt_Check(x)) {
3079 long value = PyInt_AS_LONG(x);
3080 if (value < 0 || value > 255) {
3081 PyErr_SetString(PyExc_TypeError,
3082 "character mapping must be in range(256)");
3083 Py_DECREF(x);
3084 return NULL;
3085 }
3086 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 else if (PyString_Check(x))
3089 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003091 /* wrong return value */
3092 PyErr_SetString(PyExc_TypeError,
3093 "character mapping must return integer, None or str");
3094 Py_DECREF(x);
3095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
3097}
3098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003099/* lookup the character, put the result in the output string and adjust
3100 various state variables. Reallocate the output string if not enough
3101 space is available. Return a new reference to the object that
3102 was put in the output buffer, or Py_None, if the mapping was undefined
3103 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003104 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003105static
3106PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003107 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108{
3109 PyObject *rep = charmapencode_lookup(c, mapping);
3110
3111 if (rep==NULL)
3112 return NULL;
3113 else if (rep==Py_None)
3114 return rep;
3115 else {
3116 char *outstart = PyString_AS_STRING(*outobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003117 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003119 Py_ssize_t requiredsize = *outpos+1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 if (outsize<requiredsize) {
3121 /* exponentially overallocate to minimize reallocations */
3122 if (requiredsize < 2*outsize)
3123 requiredsize = 2*outsize;
3124 if (_PyString_Resize(outobj, requiredsize)) {
3125 Py_DECREF(rep);
3126 return NULL;
3127 }
3128 outstart = PyString_AS_STRING(*outobj);
3129 }
3130 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3131 }
3132 else {
3133 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003134 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3135 Py_ssize_t requiredsize = *outpos+repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003136 if (outsize<requiredsize) {
3137 /* exponentially overallocate to minimize reallocations */
3138 if (requiredsize < 2*outsize)
3139 requiredsize = 2*outsize;
3140 if (_PyString_Resize(outobj, requiredsize)) {
3141 Py_DECREF(rep);
3142 return NULL;
3143 }
3144 outstart = PyString_AS_STRING(*outobj);
3145 }
3146 memcpy(outstart + *outpos, repchars, repsize);
3147 *outpos += repsize;
3148 }
3149 }
3150 return rep;
3151}
3152
3153/* handle an error in PyUnicode_EncodeCharmap
3154 Return 0 on success, -1 on error */
3155static
3156int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003157 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003159 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003160 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003161{
3162 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003163 Py_ssize_t repsize;
3164 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 Py_UNICODE *uni2;
3166 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003167 Py_ssize_t collstartpos = *inpos;
3168 Py_ssize_t collendpos = *inpos+1;
3169 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 char *encoding = "charmap";
3171 char *reason = "character maps to <undefined>";
3172
3173 PyObject *x;
3174 /* find all unencodable characters */
3175 while (collendpos < size) {
3176 x = charmapencode_lookup(p[collendpos], mapping);
3177 if (x==NULL)
3178 return -1;
3179 else if (x!=Py_None) {
3180 Py_DECREF(x);
3181 break;
3182 }
3183 Py_DECREF(x);
3184 ++collendpos;
3185 }
3186 /* cache callback name lookup
3187 * (if not done yet, i.e. it's the first error) */
3188 if (*known_errorHandler==-1) {
3189 if ((errors==NULL) || (!strcmp(errors, "strict")))
3190 *known_errorHandler = 1;
3191 else if (!strcmp(errors, "replace"))
3192 *known_errorHandler = 2;
3193 else if (!strcmp(errors, "ignore"))
3194 *known_errorHandler = 3;
3195 else if (!strcmp(errors, "xmlcharrefreplace"))
3196 *known_errorHandler = 4;
3197 else
3198 *known_errorHandler = 0;
3199 }
3200 switch (*known_errorHandler) {
3201 case 1: /* strict */
3202 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3203 return -1;
3204 case 2: /* replace */
3205 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3206 x = charmapencode_output('?', mapping, res, respos);
3207 if (x==NULL) {
3208 return -1;
3209 }
3210 else if (x==Py_None) {
3211 Py_DECREF(x);
3212 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3213 return -1;
3214 }
3215 Py_DECREF(x);
3216 }
3217 /* fall through */
3218 case 3: /* ignore */
3219 *inpos = collendpos;
3220 break;
3221 case 4: /* xmlcharrefreplace */
3222 /* generate replacement (temporarily (mis)uses p) */
3223 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3224 char buffer[2+29+1+1];
3225 char *cp;
3226 sprintf(buffer, "&#%d;", (int)p[collpos]);
3227 for (cp = buffer; *cp; ++cp) {
3228 x = charmapencode_output(*cp, mapping, res, respos);
3229 if (x==NULL)
3230 return -1;
3231 else if (x==Py_None) {
3232 Py_DECREF(x);
3233 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3234 return -1;
3235 }
3236 Py_DECREF(x);
3237 }
3238 }
3239 *inpos = collendpos;
3240 break;
3241 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003242 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003243 encoding, reason, p, size, exceptionObject,
3244 collstartpos, collendpos, &newpos);
3245 if (repunicode == NULL)
3246 return -1;
3247 /* generate replacement */
3248 repsize = PyUnicode_GET_SIZE(repunicode);
3249 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3250 x = charmapencode_output(*uni2, mapping, res, respos);
3251 if (x==NULL) {
3252 Py_DECREF(repunicode);
3253 return -1;
3254 }
3255 else if (x==Py_None) {
3256 Py_DECREF(repunicode);
3257 Py_DECREF(x);
3258 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3259 return -1;
3260 }
3261 Py_DECREF(x);
3262 }
3263 *inpos = newpos;
3264 Py_DECREF(repunicode);
3265 }
3266 return 0;
3267}
3268
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003270 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 PyObject *mapping,
3272 const char *errors)
3273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 /* output object */
3275 PyObject *res = NULL;
3276 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003277 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003279 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 PyObject *errorHandler = NULL;
3281 PyObject *exc = NULL;
3282 /* the following variable is used for caching string comparisons
3283 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3284 * 3=ignore, 4=xmlcharrefreplace */
3285 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286
3287 /* Default to Latin-1 */
3288 if (mapping == NULL)
3289 return PyUnicode_EncodeLatin1(p, size, errors);
3290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 /* allocate enough for a simple encoding without
3292 replacements, if we need more, we'll resize */
3293 res = PyString_FromStringAndSize(NULL, size);
3294 if (res == NULL)
3295 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003296 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003297 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 while (inpos<size) {
3300 /* try to encode it */
3301 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3302 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 if (x==Py_None) { /* unencodable character */
3305 if (charmap_encoding_error(p, size, &inpos, mapping,
3306 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003307 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003308 &res, &respos)) {
3309 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003310 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 else
3314 /* done with this character => adjust input position */
3315 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 Py_DECREF(x);
3317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 /* Resize if we allocated to much */
3320 if (respos<PyString_GET_SIZE(res)) {
3321 if (_PyString_Resize(&res, respos))
3322 goto onError;
3323 }
3324 Py_XDECREF(exc);
3325 Py_XDECREF(errorHandler);
3326 return res;
3327
3328 onError:
3329 Py_XDECREF(res);
3330 Py_XDECREF(exc);
3331 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 return NULL;
3333}
3334
3335PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3336 PyObject *mapping)
3337{
3338 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3339 PyErr_BadArgument();
3340 return NULL;
3341 }
3342 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3343 PyUnicode_GET_SIZE(unicode),
3344 mapping,
3345 NULL);
3346}
3347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348/* create or adjust a UnicodeTranslateError */
3349static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003350 const Py_UNICODE *unicode, Py_ssize_t size,
3351 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003352 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354 if (*exceptionObject == NULL) {
3355 *exceptionObject = PyUnicodeTranslateError_Create(
3356 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 }
3358 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3360 goto onError;
3361 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3362 goto onError;
3363 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3364 goto onError;
3365 return;
3366 onError:
3367 Py_DECREF(*exceptionObject);
3368 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 }
3370}
3371
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372/* raises a UnicodeTranslateError */
3373static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003374 const Py_UNICODE *unicode, Py_ssize_t size,
3375 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 const char *reason)
3377{
3378 make_translate_exception(exceptionObject,
3379 unicode, size, startpos, endpos, reason);
3380 if (*exceptionObject != NULL)
3381 PyCodec_StrictErrors(*exceptionObject);
3382}
3383
3384/* error handling callback helper:
3385 build arguments, call the callback and check the arguments,
3386 put the result into newpos and return the replacement string, which
3387 has to be freed by the caller */
3388static PyObject *unicode_translate_call_errorhandler(const char *errors,
3389 PyObject **errorHandler,
3390 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003391 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3392 Py_ssize_t startpos, Py_ssize_t endpos,
3393 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003394{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003395 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396
Martin v. Löwis412fb672006-04-13 06:34:32 +00003397 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 PyObject *restuple;
3399 PyObject *resunicode;
3400
3401 if (*errorHandler == NULL) {
3402 *errorHandler = PyCodec_LookupError(errors);
3403 if (*errorHandler == NULL)
3404 return NULL;
3405 }
3406
3407 make_translate_exception(exceptionObject,
3408 unicode, size, startpos, endpos, reason);
3409 if (*exceptionObject == NULL)
3410 return NULL;
3411
3412 restuple = PyObject_CallFunctionObjArgs(
3413 *errorHandler, *exceptionObject, NULL);
3414 if (restuple == NULL)
3415 return NULL;
3416 if (!PyTuple_Check(restuple)) {
3417 PyErr_Format(PyExc_TypeError, &argparse[4]);
3418 Py_DECREF(restuple);
3419 return NULL;
3420 }
3421 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003422 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 Py_DECREF(restuple);
3424 return NULL;
3425 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00003426 if (i_newpos<0)
3427 *newpos = size+i_newpos;
3428 else
3429 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003430 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003431 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003432 Py_DECREF(restuple);
3433 return NULL;
3434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 Py_INCREF(resunicode);
3436 Py_DECREF(restuple);
3437 return resunicode;
3438}
3439
3440/* Lookup the character ch in the mapping and put the result in result,
3441 which must be decrefed by the caller.
3442 Return 0 on success, -1 on error */
3443static
3444int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3445{
3446 PyObject *w = PyInt_FromLong((long)c);
3447 PyObject *x;
3448
3449 if (w == NULL)
3450 return -1;
3451 x = PyObject_GetItem(mapping, w);
3452 Py_DECREF(w);
3453 if (x == NULL) {
3454 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3455 /* No mapping found means: use 1:1 mapping. */
3456 PyErr_Clear();
3457 *result = NULL;
3458 return 0;
3459 } else
3460 return -1;
3461 }
3462 else if (x == Py_None) {
3463 *result = x;
3464 return 0;
3465 }
3466 else if (PyInt_Check(x)) {
3467 long value = PyInt_AS_LONG(x);
3468 long max = PyUnicode_GetMax();
3469 if (value < 0 || value > max) {
3470 PyErr_Format(PyExc_TypeError,
3471 "character mapping must be in range(0x%lx)", max+1);
3472 Py_DECREF(x);
3473 return -1;
3474 }
3475 *result = x;
3476 return 0;
3477 }
3478 else if (PyUnicode_Check(x)) {
3479 *result = x;
3480 return 0;
3481 }
3482 else {
3483 /* wrong return value */
3484 PyErr_SetString(PyExc_TypeError,
3485 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003486 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 return -1;
3488 }
3489}
3490/* ensure that *outobj is at least requiredsize characters long,
3491if not reallocate and adjust various state variables.
3492Return 0 on success, -1 on error */
3493static
Walter Dörwald4894c302003-10-24 14:25:28 +00003494int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003495 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003497 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00003498 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003500 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003502 if (requiredsize < 2 * oldsize)
3503 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003504 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 return -1;
3506 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 }
3508 return 0;
3509}
3510/* lookup the character, put the result in the output string and adjust
3511 various state variables. Return a new reference to the object that
3512 was put in the output buffer in *result, or Py_None, if the mapping was
3513 undefined (in which case no character was written).
3514 The called must decref result.
3515 Return 0 on success, -1 on error. */
3516static
Walter Dörwald4894c302003-10-24 14:25:28 +00003517int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003518 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00003519 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520{
Walter Dörwald4894c302003-10-24 14:25:28 +00003521 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 return -1;
3523 if (*res==NULL) {
3524 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003525 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 }
3527 else if (*res==Py_None)
3528 ;
3529 else if (PyInt_Check(*res)) {
3530 /* no overflow check, because we know that the space is enough */
3531 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3532 }
3533 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003534 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 if (repsize==1) {
3536 /* no overflow check, because we know that the space is enough */
3537 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3538 }
3539 else if (repsize!=0) {
3540 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003541 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003542 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003543 repsize - 1;
3544 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 return -1;
3546 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3547 *outp += repsize;
3548 }
3549 }
3550 else
3551 return -1;
3552 return 0;
3553}
3554
3555PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003556 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 PyObject *mapping,
3558 const char *errors)
3559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 /* output object */
3561 PyObject *res = NULL;
3562 /* pointers to the beginning and end+1 of input */
3563 const Py_UNICODE *startp = p;
3564 const Py_UNICODE *endp = p + size;
3565 /* pointer into the output */
3566 Py_UNICODE *str;
3567 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003568 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 char *reason = "character maps to <undefined>";
3570 PyObject *errorHandler = NULL;
3571 PyObject *exc = NULL;
3572 /* the following variable is used for caching string comparisons
3573 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3574 * 3=ignore, 4=xmlcharrefreplace */
3575 int known_errorHandler = -1;
3576
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 if (mapping == NULL) {
3578 PyErr_BadArgument();
3579 return NULL;
3580 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581
3582 /* allocate enough for a simple 1:1 translation without
3583 replacements, if we need more, we'll resize */
3584 res = PyUnicode_FromUnicode(NULL, size);
3585 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003586 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 return res;
3589 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 while (p<endp) {
3592 /* try to encode it */
3593 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003594 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 goto onError;
3597 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003598 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 if (x!=Py_None) /* it worked => adjust input pointer */
3600 ++p;
3601 else { /* untranslatable character */
3602 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003603 Py_ssize_t repsize;
3604 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 Py_UNICODE *uni2;
3606 /* startpos for collecting untranslatable chars */
3607 const Py_UNICODE *collstart = p;
3608 const Py_UNICODE *collend = p+1;
3609 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 /* find all untranslatable characters */
3612 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003613 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 goto onError;
3615 Py_XDECREF(x);
3616 if (x!=Py_None)
3617 break;
3618 ++collend;
3619 }
3620 /* cache callback name lookup
3621 * (if not done yet, i.e. it's the first error) */
3622 if (known_errorHandler==-1) {
3623 if ((errors==NULL) || (!strcmp(errors, "strict")))
3624 known_errorHandler = 1;
3625 else if (!strcmp(errors, "replace"))
3626 known_errorHandler = 2;
3627 else if (!strcmp(errors, "ignore"))
3628 known_errorHandler = 3;
3629 else if (!strcmp(errors, "xmlcharrefreplace"))
3630 known_errorHandler = 4;
3631 else
3632 known_errorHandler = 0;
3633 }
3634 switch (known_errorHandler) {
3635 case 1: /* strict */
3636 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3637 goto onError;
3638 case 2: /* replace */
3639 /* No need to check for space, this is a 1:1 replacement */
3640 for (coll = collstart; coll<collend; ++coll)
3641 *str++ = '?';
3642 /* fall through */
3643 case 3: /* ignore */
3644 p = collend;
3645 break;
3646 case 4: /* xmlcharrefreplace */
3647 /* generate replacement (temporarily (mis)uses p) */
3648 for (p = collstart; p < collend; ++p) {
3649 char buffer[2+29+1+1];
3650 char *cp;
3651 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003652 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3654 goto onError;
3655 for (cp = buffer; *cp; ++cp)
3656 *str++ = *cp;
3657 }
3658 p = collend;
3659 break;
3660 default:
3661 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3662 reason, startp, size, &exc,
3663 collstart-startp, collend-startp, &newpos);
3664 if (repunicode == NULL)
3665 goto onError;
3666 /* generate replacement */
3667 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003668 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3670 Py_DECREF(repunicode);
3671 goto onError;
3672 }
3673 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3674 *str++ = *uni2;
3675 p = startp + newpos;
3676 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 }
3678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680 /* Resize if we allocated to much */
3681 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003682 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003683 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 }
3686 Py_XDECREF(exc);
3687 Py_XDECREF(errorHandler);
3688 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 onError:
3691 Py_XDECREF(res);
3692 Py_XDECREF(exc);
3693 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 return NULL;
3695}
3696
3697PyObject *PyUnicode_Translate(PyObject *str,
3698 PyObject *mapping,
3699 const char *errors)
3700{
3701 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 str = PyUnicode_FromObject(str);
3704 if (str == NULL)
3705 goto onError;
3706 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3707 PyUnicode_GET_SIZE(str),
3708 mapping,
3709 errors);
3710 Py_DECREF(str);
3711 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003712
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 onError:
3714 Py_XDECREF(str);
3715 return NULL;
3716}
Tim Petersced69f82003-09-16 20:30:58 +00003717
Guido van Rossum9e896b32000-04-05 20:11:21 +00003718/* --- Decimal Encoder ---------------------------------------------------- */
3719
3720int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003721 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00003722 char *output,
3723 const char *errors)
3724{
3725 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 PyObject *errorHandler = NULL;
3727 PyObject *exc = NULL;
3728 const char *encoding = "decimal";
3729 const char *reason = "invalid decimal Unicode string";
3730 /* the following variable is used for caching string comparisons
3731 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3732 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003733
3734 if (output == NULL) {
3735 PyErr_BadArgument();
3736 return -1;
3737 }
3738
3739 p = s;
3740 end = s + length;
3741 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003743 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003745 Py_ssize_t repsize;
3746 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 Py_UNICODE *uni2;
3748 Py_UNICODE *collstart;
3749 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003750
Guido van Rossum9e896b32000-04-05 20:11:21 +00003751 if (Py_UNICODE_ISSPACE(ch)) {
3752 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003754 continue;
3755 }
3756 decimal = Py_UNICODE_TODECIMAL(ch);
3757 if (decimal >= 0) {
3758 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003760 continue;
3761 }
Guido van Rossumba477042000-04-06 18:18:10 +00003762 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003763 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003765 continue;
3766 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 /* All other characters are considered unencodable */
3768 collstart = p;
3769 collend = p+1;
3770 while (collend < end) {
3771 if ((0 < *collend && *collend < 256) ||
3772 !Py_UNICODE_ISSPACE(*collend) ||
3773 Py_UNICODE_TODECIMAL(*collend))
3774 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003775 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 /* cache callback name lookup
3777 * (if not done yet, i.e. it's the first error) */
3778 if (known_errorHandler==-1) {
3779 if ((errors==NULL) || (!strcmp(errors, "strict")))
3780 known_errorHandler = 1;
3781 else if (!strcmp(errors, "replace"))
3782 known_errorHandler = 2;
3783 else if (!strcmp(errors, "ignore"))
3784 known_errorHandler = 3;
3785 else if (!strcmp(errors, "xmlcharrefreplace"))
3786 known_errorHandler = 4;
3787 else
3788 known_errorHandler = 0;
3789 }
3790 switch (known_errorHandler) {
3791 case 1: /* strict */
3792 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3793 goto onError;
3794 case 2: /* replace */
3795 for (p = collstart; p < collend; ++p)
3796 *output++ = '?';
3797 /* fall through */
3798 case 3: /* ignore */
3799 p = collend;
3800 break;
3801 case 4: /* xmlcharrefreplace */
3802 /* generate replacement (temporarily (mis)uses p) */
3803 for (p = collstart; p < collend; ++p)
3804 output += sprintf(output, "&#%d;", (int)*p);
3805 p = collend;
3806 break;
3807 default:
3808 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3809 encoding, reason, s, length, &exc,
3810 collstart-s, collend-s, &newpos);
3811 if (repunicode == NULL)
3812 goto onError;
3813 /* generate replacement */
3814 repsize = PyUnicode_GET_SIZE(repunicode);
3815 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3816 Py_UNICODE ch = *uni2;
3817 if (Py_UNICODE_ISSPACE(ch))
3818 *output++ = ' ';
3819 else {
3820 decimal = Py_UNICODE_TODECIMAL(ch);
3821 if (decimal >= 0)
3822 *output++ = '0' + decimal;
3823 else if (0 < ch && ch < 256)
3824 *output++ = (char)ch;
3825 else {
3826 Py_DECREF(repunicode);
3827 raise_encode_exception(&exc, encoding,
3828 s, length, collstart-s, collend-s, reason);
3829 goto onError;
3830 }
3831 }
3832 }
3833 p = s + newpos;
3834 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003835 }
3836 }
3837 /* 0-terminate the output string */
3838 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839 Py_XDECREF(exc);
3840 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003841 return 0;
3842
3843 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 Py_XDECREF(exc);
3845 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003846 return -1;
3847}
3848
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849/* --- Helpers ------------------------------------------------------------ */
3850
Fredrik Lundhb63588c2006-05-23 18:44:25 +00003851static Py_ssize_t count(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003852 Py_ssize_t start,
3853 Py_ssize_t end,
3854 PyUnicodeObject *substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855{
Martin v. Löwis412fb672006-04-13 06:34:32 +00003856 Py_ssize_t count = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003858 if (start < 0)
3859 start += self->length;
3860 if (start < 0)
3861 start = 0;
3862 if (end > self->length)
3863 end = self->length;
3864 if (end < 0)
3865 end += self->length;
3866 if (end < 0)
3867 end = 0;
3868
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003869 if (substring->length == 0)
3870 return (end - start + 1);
3871
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 end -= substring->length;
3873
3874 while (start <= end)
3875 if (Py_UNICODE_MATCH(self, start, substring)) {
3876 count++;
3877 start += substring->length;
3878 } else
3879 start++;
3880
3881 return count;
3882}
3883
Martin v. Löwis18e16552006-02-15 17:27:45 +00003884Py_ssize_t PyUnicode_Count(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003886 Py_ssize_t start,
3887 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003889 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003890
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 str = PyUnicode_FromObject(str);
3892 if (str == NULL)
3893 return -1;
3894 substr = PyUnicode_FromObject(substr);
3895 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003896 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 return -1;
3898 }
Tim Petersced69f82003-09-16 20:30:58 +00003899
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 result = count((PyUnicodeObject *)str,
3901 start, end,
3902 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003903
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 Py_DECREF(str);
3905 Py_DECREF(substr);
3906 return result;
3907}
3908
Fredrik Lundhb63588c2006-05-23 18:44:25 +00003909static Py_ssize_t findstring(PyUnicodeObject *self,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003911 Py_ssize_t start,
3912 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 int direction)
3914{
3915 if (start < 0)
3916 start += self->length;
3917 if (start < 0)
3918 start = 0;
3919
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 if (end > self->length)
3921 end = self->length;
3922 if (end < 0)
3923 end += self->length;
3924 if (end < 0)
3925 end = 0;
3926
Guido van Rossum76afbd92002-08-20 17:29:29 +00003927 if (substring->length == 0)
3928 return (direction > 0) ? start : end;
3929
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 end -= substring->length;
3931
3932 if (direction < 0) {
3933 for (; end >= start; end--)
3934 if (Py_UNICODE_MATCH(self, end, substring))
3935 return end;
3936 } else {
3937 for (; start <= end; start++)
3938 if (Py_UNICODE_MATCH(self, start, substring))
3939 return start;
3940 }
3941
3942 return -1;
3943}
3944
Martin v. Löwis18e16552006-02-15 17:27:45 +00003945Py_ssize_t PyUnicode_Find(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003947 Py_ssize_t start,
3948 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 int direction)
3950{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003951 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00003952
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 str = PyUnicode_FromObject(str);
3954 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003955 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 substr = PyUnicode_FromObject(substr);
3957 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003958 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003959 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 }
Tim Petersced69f82003-09-16 20:30:58 +00003961
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 result = findstring((PyUnicodeObject *)str,
3963 (PyUnicodeObject *)substr,
3964 start, end, direction);
3965 Py_DECREF(str);
3966 Py_DECREF(substr);
3967 return result;
3968}
3969
Tim Petersced69f82003-09-16 20:30:58 +00003970static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971int tailmatch(PyUnicodeObject *self,
3972 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003973 Py_ssize_t start,
3974 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 int direction)
3976{
3977 if (start < 0)
3978 start += self->length;
3979 if (start < 0)
3980 start = 0;
3981
3982 if (substring->length == 0)
3983 return 1;
3984
3985 if (end > self->length)
3986 end = self->length;
3987 if (end < 0)
3988 end += self->length;
3989 if (end < 0)
3990 end = 0;
3991
3992 end -= substring->length;
3993 if (end < start)
3994 return 0;
3995
3996 if (direction > 0) {
3997 if (Py_UNICODE_MATCH(self, end, substring))
3998 return 1;
3999 } else {
4000 if (Py_UNICODE_MATCH(self, start, substring))
4001 return 1;
4002 }
4003
4004 return 0;
4005}
4006
Martin v. Löwis18e16552006-02-15 17:27:45 +00004007Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004009 Py_ssize_t start,
4010 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 int direction)
4012{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004013 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004014
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 str = PyUnicode_FromObject(str);
4016 if (str == NULL)
4017 return -1;
4018 substr = PyUnicode_FromObject(substr);
4019 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004020 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 return -1;
4022 }
Tim Petersced69f82003-09-16 20:30:58 +00004023
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 result = tailmatch((PyUnicodeObject *)str,
4025 (PyUnicodeObject *)substr,
4026 start, end, direction);
4027 Py_DECREF(str);
4028 Py_DECREF(substr);
4029 return result;
4030}
4031
Tim Petersced69f82003-09-16 20:30:58 +00004032static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033const Py_UNICODE *findchar(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004034 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 Py_UNICODE ch)
4036{
4037 /* like wcschr, but doesn't stop at NULL characters */
4038
4039 while (size-- > 0) {
4040 if (*s == ch)
4041 return s;
4042 s++;
4043 }
4044
4045 return NULL;
4046}
4047
4048/* Apply fixfct filter to the Unicode object self and return a
4049 reference to the modified object */
4050
Tim Petersced69f82003-09-16 20:30:58 +00004051static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052PyObject *fixup(PyUnicodeObject *self,
4053 int (*fixfct)(PyUnicodeObject *s))
4054{
4055
4056 PyUnicodeObject *u;
4057
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004058 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 if (u == NULL)
4060 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004061
4062 Py_UNICODE_COPY(u->str, self->str, self->length);
4063
Tim Peters7a29bd52001-09-12 03:03:31 +00004064 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 /* fixfct should return TRUE if it modified the buffer. If
4066 FALSE, return a reference to the original buffer instead
4067 (to save space, not time) */
4068 Py_INCREF(self);
4069 Py_DECREF(u);
4070 return (PyObject*) self;
4071 }
4072 return (PyObject*) u;
4073}
4074
Tim Petersced69f82003-09-16 20:30:58 +00004075static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076int fixupper(PyUnicodeObject *self)
4077{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004078 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 Py_UNICODE *s = self->str;
4080 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004081
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 while (len-- > 0) {
4083 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004084
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 ch = Py_UNICODE_TOUPPER(*s);
4086 if (ch != *s) {
4087 status = 1;
4088 *s = ch;
4089 }
4090 s++;
4091 }
4092
4093 return status;
4094}
4095
Tim Petersced69f82003-09-16 20:30:58 +00004096static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097int fixlower(PyUnicodeObject *self)
4098{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004099 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 Py_UNICODE *s = self->str;
4101 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004102
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 while (len-- > 0) {
4104 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004105
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 ch = Py_UNICODE_TOLOWER(*s);
4107 if (ch != *s) {
4108 status = 1;
4109 *s = ch;
4110 }
4111 s++;
4112 }
4113
4114 return status;
4115}
4116
Tim Petersced69f82003-09-16 20:30:58 +00004117static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118int fixswapcase(PyUnicodeObject *self)
4119{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004120 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 Py_UNICODE *s = self->str;
4122 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004123
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 while (len-- > 0) {
4125 if (Py_UNICODE_ISUPPER(*s)) {
4126 *s = Py_UNICODE_TOLOWER(*s);
4127 status = 1;
4128 } else if (Py_UNICODE_ISLOWER(*s)) {
4129 *s = Py_UNICODE_TOUPPER(*s);
4130 status = 1;
4131 }
4132 s++;
4133 }
4134
4135 return status;
4136}
4137
Tim Petersced69f82003-09-16 20:30:58 +00004138static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139int fixcapitalize(PyUnicodeObject *self)
4140{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004141 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004142 Py_UNICODE *s = self->str;
4143 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004144
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004145 if (len == 0)
4146 return 0;
4147 if (Py_UNICODE_ISLOWER(*s)) {
4148 *s = Py_UNICODE_TOUPPER(*s);
4149 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004151 s++;
4152 while (--len > 0) {
4153 if (Py_UNICODE_ISUPPER(*s)) {
4154 *s = Py_UNICODE_TOLOWER(*s);
4155 status = 1;
4156 }
4157 s++;
4158 }
4159 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160}
4161
4162static
4163int fixtitle(PyUnicodeObject *self)
4164{
4165 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4166 register Py_UNICODE *e;
4167 int previous_is_cased;
4168
4169 /* Shortcut for single character strings */
4170 if (PyUnicode_GET_SIZE(self) == 1) {
4171 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4172 if (*p != ch) {
4173 *p = ch;
4174 return 1;
4175 }
4176 else
4177 return 0;
4178 }
Tim Petersced69f82003-09-16 20:30:58 +00004179
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 e = p + PyUnicode_GET_SIZE(self);
4181 previous_is_cased = 0;
4182 for (; p < e; p++) {
4183 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004184
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 if (previous_is_cased)
4186 *p = Py_UNICODE_TOLOWER(ch);
4187 else
4188 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004189
4190 if (Py_UNICODE_ISLOWER(ch) ||
4191 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 Py_UNICODE_ISTITLE(ch))
4193 previous_is_cased = 1;
4194 else
4195 previous_is_cased = 0;
4196 }
4197 return 1;
4198}
4199
Tim Peters8ce9f162004-08-27 01:49:32 +00004200PyObject *
4201PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202{
Tim Peters8ce9f162004-08-27 01:49:32 +00004203 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004204 const Py_UNICODE blank = ' ';
4205 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004206 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004207 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004208 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4209 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004210 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4211 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004212 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004213 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004214 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215
Tim Peters05eba1f2004-08-27 21:32:02 +00004216 fseq = PySequence_Fast(seq, "");
4217 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004218 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004219 }
4220
Tim Peters91879ab2004-08-27 22:35:44 +00004221 /* Grrrr. A codec may be invoked to convert str objects to
4222 * Unicode, and so it's possible to call back into Python code
4223 * during PyUnicode_FromObject(), and so it's possible for a sick
4224 * codec to change the size of fseq (if seq is a list). Therefore
4225 * we have to keep refetching the size -- can't assume seqlen
4226 * is invariant.
4227 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004228 seqlen = PySequence_Fast_GET_SIZE(fseq);
4229 /* If empty sequence, return u"". */
4230 if (seqlen == 0) {
4231 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4232 goto Done;
4233 }
4234 /* If singleton sequence with an exact Unicode, return that. */
4235 if (seqlen == 1) {
4236 item = PySequence_Fast_GET_ITEM(fseq, 0);
4237 if (PyUnicode_CheckExact(item)) {
4238 Py_INCREF(item);
4239 res = (PyUnicodeObject *)item;
4240 goto Done;
4241 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004242 }
4243
Tim Peters05eba1f2004-08-27 21:32:02 +00004244 /* At least two items to join, or one that isn't exact Unicode. */
4245 if (seqlen > 1) {
4246 /* Set up sep and seplen -- they're needed. */
4247 if (separator == NULL) {
4248 sep = &blank;
4249 seplen = 1;
4250 }
4251 else {
4252 internal_separator = PyUnicode_FromObject(separator);
4253 if (internal_separator == NULL)
4254 goto onError;
4255 sep = PyUnicode_AS_UNICODE(internal_separator);
4256 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004257 /* In case PyUnicode_FromObject() mutated seq. */
4258 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004259 }
4260 }
4261
4262 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004263 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004264 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004265 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004266 res_p = PyUnicode_AS_UNICODE(res);
4267 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004268
Tim Peters05eba1f2004-08-27 21:32:02 +00004269 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004270 Py_ssize_t itemlen;
4271 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004272
4273 item = PySequence_Fast_GET_ITEM(fseq, i);
4274 /* Convert item to Unicode. */
4275 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4276 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004277 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004278 " %.80s found",
4279 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004280 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004281 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004282 item = PyUnicode_FromObject(item);
4283 if (item == NULL)
4284 goto onError;
4285 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004286
Tim Peters91879ab2004-08-27 22:35:44 +00004287 /* In case PyUnicode_FromObject() mutated seq. */
4288 seqlen = PySequence_Fast_GET_SIZE(fseq);
4289
Tim Peters8ce9f162004-08-27 01:49:32 +00004290 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004292 new_res_used = res_used + itemlen;
Tim Peters286085c2006-05-22 19:17:04 +00004293 if (new_res_used <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004294 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004295 if (i < seqlen - 1) {
4296 new_res_used += seplen;
Tim Peters286085c2006-05-22 19:17:04 +00004297 if (new_res_used <= 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004298 goto Overflow;
4299 }
4300 if (new_res_used > res_alloc) {
4301 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004302 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004303 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004304 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004305 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004306 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004307 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004308 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004310 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004311 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004313
4314 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004315 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004316 res_p += itemlen;
4317 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004318 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004319 res_p += seplen;
4320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004322 res_used = new_res_used;
4323 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004324
Tim Peters05eba1f2004-08-27 21:32:02 +00004325 /* Shrink res to match the used area; this probably can't fail,
4326 * but it's cheap to check.
4327 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004328 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004329 goto onError;
4330
4331 Done:
4332 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004333 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 return (PyObject *)res;
4335
Tim Peters8ce9f162004-08-27 01:49:32 +00004336 Overflow:
4337 PyErr_SetString(PyExc_OverflowError,
4338 "join() is too long for a Python string");
4339 Py_DECREF(item);
4340 /* fall through */
4341
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004343 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004344 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004345 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346 return NULL;
4347}
4348
Tim Petersced69f82003-09-16 20:30:58 +00004349static
4350PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004351 Py_ssize_t left,
4352 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 Py_UNICODE fill)
4354{
4355 PyUnicodeObject *u;
4356
4357 if (left < 0)
4358 left = 0;
4359 if (right < 0)
4360 right = 0;
4361
Tim Peters7a29bd52001-09-12 03:03:31 +00004362 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 Py_INCREF(self);
4364 return self;
4365 }
4366
4367 u = _PyUnicode_New(left + self->length + right);
4368 if (u) {
4369 if (left)
4370 Py_UNICODE_FILL(u->str, fill, left);
4371 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4372 if (right)
4373 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4374 }
4375
4376 return u;
4377}
4378
/* Append the slice data[left:right] to list as a new unicode object.

   Expects the enclosing function to provide a PyObject *str scratch
   variable, a PyObject *list to append to and an onError label, which
   is jumped to if the slice cannot be created or appended.  Wrapped in
   do { } while (0) so that an invocation followed by ';' behaves as a
   single statement, even under an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4389
4390static
4391PyObject *split_whitespace(PyUnicodeObject *self,
4392 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004393 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004395 register Py_ssize_t i;
4396 register Py_ssize_t j;
4397 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 PyObject *str;
4399
4400 for (i = j = 0; i < len; ) {
4401 /* find a token */
4402 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4403 i++;
4404 j = i;
4405 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4406 i++;
4407 if (j < i) {
4408 if (maxcount-- <= 0)
4409 break;
4410 SPLIT_APPEND(self->str, j, i);
4411 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4412 i++;
4413 j = i;
4414 }
4415 }
4416 if (j < len) {
4417 SPLIT_APPEND(self->str, j, len);
4418 }
4419 return list;
4420
4421 onError:
4422 Py_DECREF(list);
4423 return NULL;
4424}
4425
4426PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004427 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004429 register Py_ssize_t i;
4430 register Py_ssize_t j;
4431 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 PyObject *list;
4433 PyObject *str;
4434 Py_UNICODE *data;
4435
4436 string = PyUnicode_FromObject(string);
4437 if (string == NULL)
4438 return NULL;
4439 data = PyUnicode_AS_UNICODE(string);
4440 len = PyUnicode_GET_SIZE(string);
4441
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 list = PyList_New(0);
4443 if (!list)
4444 goto onError;
4445
4446 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004447 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00004448
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004450 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452
4453 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004454 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 if (i < len) {
4456 if (data[i] == '\r' && i + 1 < len &&
4457 data[i+1] == '\n')
4458 i += 2;
4459 else
4460 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004461 if (keepends)
4462 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 }
Guido van Rossum86662912000-04-11 15:38:46 +00004464 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 j = i;
4466 }
4467 if (j < len) {
4468 SPLIT_APPEND(data, j, len);
4469 }
4470
4471 Py_DECREF(string);
4472 return list;
4473
4474 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004475 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 Py_DECREF(string);
4477 return NULL;
4478}
4479
Tim Petersced69f82003-09-16 20:30:58 +00004480static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481PyObject *split_char(PyUnicodeObject *self,
4482 PyObject *list,
4483 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004484 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004485{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004486 register Py_ssize_t i;
4487 register Py_ssize_t j;
4488 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 PyObject *str;
4490
4491 for (i = j = 0; i < len; ) {
4492 if (self->str[i] == ch) {
4493 if (maxcount-- <= 0)
4494 break;
4495 SPLIT_APPEND(self->str, j, i);
4496 i = j = i + 1;
4497 } else
4498 i++;
4499 }
4500 if (j <= len) {
4501 SPLIT_APPEND(self->str, j, len);
4502 }
4503 return list;
4504
4505 onError:
4506 Py_DECREF(list);
4507 return NULL;
4508}
4509
Tim Petersced69f82003-09-16 20:30:58 +00004510static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511PyObject *split_substring(PyUnicodeObject *self,
4512 PyObject *list,
4513 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004514 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004516 register Py_ssize_t i;
4517 register Py_ssize_t j;
4518 Py_ssize_t len = self->length;
4519 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 PyObject *str;
4521
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004522 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 if (Py_UNICODE_MATCH(self, i, substring)) {
4524 if (maxcount-- <= 0)
4525 break;
4526 SPLIT_APPEND(self->str, j, i);
4527 i = j = i + sublen;
4528 } else
4529 i++;
4530 }
4531 if (j <= len) {
4532 SPLIT_APPEND(self->str, j, len);
4533 }
4534 return list;
4535
4536 onError:
4537 Py_DECREF(list);
4538 return NULL;
4539}
4540
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004541static
4542PyObject *rsplit_whitespace(PyUnicodeObject *self,
4543 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004544 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004545{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004546 register Py_ssize_t i;
4547 register Py_ssize_t j;
4548 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004549 PyObject *str;
4550
4551 for (i = j = len - 1; i >= 0; ) {
4552 /* find a token */
4553 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4554 i--;
4555 j = i;
4556 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4557 i--;
4558 if (j > i) {
4559 if (maxcount-- <= 0)
4560 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004561 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004562 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4563 i--;
4564 j = i;
4565 }
4566 }
4567 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004568 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004569 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004570 if (PyList_Reverse(list) < 0)
4571 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004572 return list;
4573
4574 onError:
4575 Py_DECREF(list);
4576 return NULL;
4577}
4578
4579static
4580PyObject *rsplit_char(PyUnicodeObject *self,
4581 PyObject *list,
4582 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004583 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004584{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004585 register Py_ssize_t i;
4586 register Py_ssize_t j;
4587 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004588 PyObject *str;
4589
4590 for (i = j = len - 1; i >= 0; ) {
4591 if (self->str[i] == ch) {
4592 if (maxcount-- <= 0)
4593 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004594 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004595 j = i = i - 1;
4596 } else
4597 i--;
4598 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004599 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004600 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004601 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004602 if (PyList_Reverse(list) < 0)
4603 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004604 return list;
4605
4606 onError:
4607 Py_DECREF(list);
4608 return NULL;
4609}
4610
4611static
4612PyObject *rsplit_substring(PyUnicodeObject *self,
4613 PyObject *list,
4614 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004615 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004616{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004617 register Py_ssize_t i;
4618 register Py_ssize_t j;
4619 Py_ssize_t len = self->length;
4620 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004621 PyObject *str;
4622
4623 for (i = len - sublen, j = len; i >= 0; ) {
4624 if (Py_UNICODE_MATCH(self, i, substring)) {
4625 if (maxcount-- <= 0)
4626 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004627 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004628 j = i;
4629 i -= sublen;
4630 } else
4631 i--;
4632 }
4633 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004634 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004635 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00004636 if (PyList_Reverse(list) < 0)
4637 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004638 return list;
4639
4640 onError:
4641 Py_DECREF(list);
4642 return NULL;
4643}
4644
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645#undef SPLIT_APPEND
4646
4647static
4648PyObject *split(PyUnicodeObject *self,
4649 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004650 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651{
4652 PyObject *list;
4653
4654 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004655 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656
4657 list = PyList_New(0);
4658 if (!list)
4659 return NULL;
4660
4661 if (substring == NULL)
4662 return split_whitespace(self,list,maxcount);
4663
4664 else if (substring->length == 1)
4665 return split_char(self,list,substring->str[0],maxcount);
4666
4667 else if (substring->length == 0) {
4668 Py_DECREF(list);
4669 PyErr_SetString(PyExc_ValueError, "empty separator");
4670 return NULL;
4671 }
4672 else
4673 return split_substring(self,list,substring,maxcount);
4674}
4675
Tim Petersced69f82003-09-16 20:30:58 +00004676static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004677PyObject *rsplit(PyUnicodeObject *self,
4678 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004679 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004680{
4681 PyObject *list;
4682
4683 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004684 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004685
4686 list = PyList_New(0);
4687 if (!list)
4688 return NULL;
4689
4690 if (substring == NULL)
4691 return rsplit_whitespace(self,list,maxcount);
4692
4693 else if (substring->length == 1)
4694 return rsplit_char(self,list,substring->str[0],maxcount);
4695
4696 else if (substring->length == 0) {
4697 Py_DECREF(list);
4698 PyErr_SetString(PyExc_ValueError, "empty separator");
4699 return NULL;
4700 }
4701 else
4702 return rsplit_substring(self,list,substring,maxcount);
4703}
4704
4705static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706PyObject *replace(PyUnicodeObject *self,
4707 PyUnicodeObject *str1,
4708 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004709 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710{
4711 PyUnicodeObject *u;
4712
4713 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00004714 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715
4716 if (str1->length == 1 && str2->length == 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004717 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718
4719 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004720 if (!findchar(self->str, self->length, str1->str[0]) &&
4721 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 /* nothing to replace, return original string */
4723 Py_INCREF(self);
4724 u = self;
4725 } else {
4726 Py_UNICODE u1 = str1->str[0];
4727 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004728
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004730 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 self->length
4732 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004733 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004734 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004735 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 for (i = 0; i < u->length; i++)
4737 if (u->str[i] == u1) {
4738 if (--maxcount < 0)
4739 break;
4740 u->str[i] = u2;
4741 }
4742 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744
4745 } else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004746 Py_ssize_t n, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 Py_UNICODE *p;
4748
4749 /* replace strings */
4750 n = count(self, 0, self->length, str1);
4751 if (n > maxcount)
4752 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004753 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004755 if (PyUnicode_CheckExact(self)) {
4756 Py_INCREF(self);
4757 u = self;
4758 }
4759 else {
4760 u = (PyUnicodeObject *)
4761 PyUnicode_FromUnicode(self->str, self->length);
4762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 } else {
4764 u = _PyUnicode_New(
4765 self->length + n * (str2->length - str1->length));
4766 if (u) {
4767 i = 0;
4768 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004769 if (str1->length > 0) {
4770 while (i <= self->length - str1->length)
4771 if (Py_UNICODE_MATCH(self, i, str1)) {
4772 /* replace string segment */
4773 Py_UNICODE_COPY(p, str2->str, str2->length);
4774 p += str2->length;
4775 i += str1->length;
4776 if (--n <= 0) {
4777 /* copy remaining part */
4778 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4779 break;
4780 }
4781 } else
4782 *p++ = self->str[i++];
4783 } else {
4784 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 Py_UNICODE_COPY(p, str2->str, str2->length);
4786 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004787 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004790 }
4791 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 }
4794 }
4795 }
Tim Petersced69f82003-09-16 20:30:58 +00004796
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797 return (PyObject *) u;
4798}
4799
4800/* --- Unicode Object Methods --------------------------------------------- */
4801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004802PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803"S.title() -> unicode\n\
4804\n\
4805Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004806characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
4808static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004809unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 return fixup(self, fixtitle);
4812}
4813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004814PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815"S.capitalize() -> unicode\n\
4816\n\
4817Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004818have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819
4820static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004821unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 return fixup(self, fixcapitalize);
4824}
4825
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self on whitespace, replaces every word in the resulting list
   by its fixup(..., fixcapitalize) result, and joins the list with
   PyUnicode_Join(NULL, list).  Returns the joined object, or NULL if the
   split, any fixup() call or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (list == NULL)
        return NULL;

    /* Capitalize each word */
    i = 0;
    while (i < PyList_GET_SIZE(list)) {
        PyObject *word = PyList_GET_ITEM(list, i);

        item = fixup((PyUnicodeObject *)word, fixcapitalize);
        if (item == NULL)
            goto onError;
        /* Release the old entry, then store the new one in its slot. */
        Py_DECREF(word);
        PyList_SET_ITEM(list, i, item);
        i++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

onError:
    /* Reached on success as well; item is NULL on the failure paths. */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
4863
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004864/* Argument converter. Coerces to a single unicode character */
4865
4866static int
4867convert_uc(PyObject *obj, void *addr)
4868{
4869 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4870 PyObject *uniobj;
4871 Py_UNICODE *unistr;
4872
4873 uniobj = PyUnicode_FromObject(obj);
4874 if (uniobj == NULL) {
4875 PyErr_SetString(PyExc_TypeError,
4876 "The fill character cannot be converted to Unicode");
4877 return 0;
4878 }
4879 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4880 PyErr_SetString(PyExc_TypeError,
4881 "The fill character must be exactly one character long");
4882 Py_DECREF(uniobj);
4883 return 0;
4884 }
4885 unistr = PyUnicode_AS_UNICODE(uniobj);
4886 *fillcharloc = unistr[0];
4887 Py_DECREF(uniobj);
4888 return 1;
4889}
4890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004891PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004892"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004894Return S centered in a Unicode string of length width. Padding is\n\
4895done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896
4897static PyObject *
4898unicode_center(PyUnicodeObject *self, PyObject *args)
4899{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004900 Py_ssize_t marg, left;
4901 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004902 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903
Thomas Woutersde017742006-02-16 19:34:37 +00004904 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 return NULL;
4906
Tim Peters7a29bd52001-09-12 03:03:31 +00004907 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 Py_INCREF(self);
4909 return (PyObject*) self;
4910 }
4911
4912 marg = width - self->length;
4913 left = marg / 2 + (marg & width & 1);
4914
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004915 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916}
4917
Marc-André Lemburge5034372000-08-08 08:04:29 +00004918#if 0
4919
4920/* This code should go into some future Unicode collation support
4921 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004922 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004923
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004924/* speedy UTF-16 code point order comparison */
4925/* gleaned from: */
4926/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4927
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004928static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004929{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004930 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004931 0, 0, 0, 0, 0, 0, 0, 0,
4932 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004933 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004934};
4935
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936static int
4937unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4938{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004939 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004940
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941 Py_UNICODE *s1 = str1->str;
4942 Py_UNICODE *s2 = str2->str;
4943
4944 len1 = str1->length;
4945 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004946
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004948 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004949
4950 c1 = *s1++;
4951 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004952
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004953 if (c1 > (1<<11) * 26)
4954 c1 += utf16Fixup[c1>>11];
4955 if (c2 > (1<<11) * 26)
4956 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004957 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004958
4959 if (c1 != c2)
4960 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004961
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004962 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963 }
4964
4965 return (len1 < len2) ? -1 : (len1 != len2);
4966}
4967
Marc-André Lemburge5034372000-08-08 08:04:29 +00004968#else
4969
4970static int
4971unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4972{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004973 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004974
4975 Py_UNICODE *s1 = str1->str;
4976 Py_UNICODE *s2 = str2->str;
4977
4978 len1 = str1->length;
4979 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004980
Marc-André Lemburge5034372000-08-08 08:04:29 +00004981 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004982 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004983
Fredrik Lundh45714e92001-06-26 16:39:36 +00004984 c1 = *s1++;
4985 c2 = *s2++;
4986
4987 if (c1 != c2)
4988 return (c1 < c2) ? -1 : 1;
4989
Marc-André Lemburge5034372000-08-08 08:04:29 +00004990 len1--; len2--;
4991 }
4992
4993 return (len1 < len2) ? -1 : (len1 != len2);
4994}
4995
4996#endif
4997
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998int PyUnicode_Compare(PyObject *left,
4999 PyObject *right)
5000{
5001 PyUnicodeObject *u = NULL, *v = NULL;
5002 int result;
5003
5004 /* Coerce the two arguments */
5005 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5006 if (u == NULL)
5007 goto onError;
5008 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5009 if (v == NULL)
5010 goto onError;
5011
Thomas Wouters7e474022000-07-16 12:04:32 +00005012 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 if (v == u) {
5014 Py_DECREF(u);
5015 Py_DECREF(v);
5016 return 0;
5017 }
5018
5019 result = unicode_compare(u, v);
5020
5021 Py_DECREF(u);
5022 Py_DECREF(v);
5023 return result;
5024
5025onError:
5026 Py_XDECREF(u);
5027 Py_XDECREF(v);
5028 return -1;
5029}
5030
Guido van Rossum403d68b2000-03-13 15:55:09 +00005031int PyUnicode_Contains(PyObject *container,
5032 PyObject *element)
5033{
Fredrik Lundh833bf942006-05-23 10:12:21 +00005034 PyUnicodeObject *u, *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005035 int result;
5036 Py_ssize_t size;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005037
5038 /* Coerce the two arguments */
Fredrik Lundh833bf942006-05-23 10:12:21 +00005039 v = (PyUnicodeObject *) PyUnicode_FromObject(element);
5040 if (!v) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005041 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005042 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005043 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005044 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005045
5046 u = (PyUnicodeObject *) PyUnicode_FromObject(container);
5047 if (!u) {
5048 Py_DECREF(v);
5049 return -1;
5050 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005051
Barry Warsaw817918c2002-08-06 16:58:21 +00005052 size = PyUnicode_GET_SIZE(v);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005053 if (!size) {
5054 result = 1;
5055 goto done;
5056 }
Barry Warsaw817918c2002-08-06 16:58:21 +00005057
Guido van Rossum403d68b2000-03-13 15:55:09 +00005058 result = 0;
Fredrik Lundh833bf942006-05-23 10:12:21 +00005059
Barry Warsaw817918c2002-08-06 16:58:21 +00005060 if (size == 1) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00005061 Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
5062 Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
5063 Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
5064 for (; ptr < end; ptr++) {
5065 if (*ptr == chr) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005066 result = 1;
5067 break;
5068 }
5069 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005070 } else {
5071 int start = 0;
5072 int end = PyUnicode_GET_SIZE(u) - size;
5073 for (; start <= end; start++)
5074 if (Py_UNICODE_MATCH(u, start, v)) {
5075 result = 1;
5076 break;
5077 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005078 }
5079
Fredrik Lundh833bf942006-05-23 10:12:21 +00005080done:
Guido van Rossum403d68b2000-03-13 15:55:09 +00005081 Py_DECREF(u);
5082 Py_DECREF(v);
5083 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005084}
5085
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086/* Concat to string or Unicode object giving a new Unicode object. */
5087
5088PyObject *PyUnicode_Concat(PyObject *left,
5089 PyObject *right)
5090{
5091 PyUnicodeObject *u = NULL, *v = NULL, *w;
5092
5093 /* Coerce the two arguments */
5094 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5095 if (u == NULL)
5096 goto onError;
5097 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5098 if (v == NULL)
5099 goto onError;
5100
5101 /* Shortcuts */
5102 if (v == unicode_empty) {
5103 Py_DECREF(v);
5104 return (PyObject *)u;
5105 }
5106 if (u == unicode_empty) {
5107 Py_DECREF(u);
5108 return (PyObject *)v;
5109 }
5110
5111 /* Concat the two Unicode strings */
5112 w = _PyUnicode_New(u->length + v->length);
5113 if (w == NULL)
5114 goto onError;
5115 Py_UNICODE_COPY(w->str, u->str, u->length);
5116 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5117
5118 Py_DECREF(u);
5119 Py_DECREF(v);
5120 return (PyObject *)w;
5121
5122onError:
5123 Py_XDECREF(u);
5124 Py_XDECREF(v);
5125 return NULL;
5126}
5127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005128PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129"S.count(sub[, start[, end]]) -> int\n\
5130\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005131Return the number of non-overlapping occurrences of substring sub in\n\
5132Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005133interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134
5135static PyObject *
5136unicode_count(PyUnicodeObject *self, PyObject *args)
5137{
5138 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005139 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005140 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 PyObject *result;
5142
Guido van Rossumb8872e62000-05-09 14:14:27 +00005143 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5144 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 return NULL;
5146
5147 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5148 (PyObject *)substring);
5149 if (substring == NULL)
5150 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005151
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 if (start < 0)
5153 start += self->length;
5154 if (start < 0)
5155 start = 0;
5156 if (end > self->length)
5157 end = self->length;
5158 if (end < 0)
5159 end += self->length;
5160 if (end < 0)
5161 end = 0;
5162
5163 result = PyInt_FromLong((long) count(self, start, end, substring));
5164
5165 Py_DECREF(substring);
5166 return result;
5167}
5168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005169PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005170"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005172Encodes S using the codec registered for encoding. encoding defaults\n\
5173to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005174handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005175a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5176'xmlcharrefreplace' as well as any other name registered with\n\
5177codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178
5179static PyObject *
5180unicode_encode(PyUnicodeObject *self, PyObject *args)
5181{
5182 char *encoding = NULL;
5183 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005184 PyObject *v;
5185
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5187 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005188 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005189 if (v == NULL)
5190 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005191 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5192 PyErr_Format(PyExc_TypeError,
5193 "encoder did not return a string/unicode object "
5194 "(type=%.400s)",
5195 v->ob_type->tp_name);
5196 Py_DECREF(v);
5197 return NULL;
5198 }
5199 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005200
5201 onError:
5202 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005203}
5204
5205PyDoc_STRVAR(decode__doc__,
5206"S.decode([encoding[,errors]]) -> string or unicode\n\
5207\n\
5208Decodes S using the codec registered for encoding. encoding defaults\n\
5209to the default encoding. errors may be given to set a different error\n\
5210handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5211a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5212as well as any other name registerd with codecs.register_error that is\n\
5213able to handle UnicodeDecodeErrors.");
5214
5215static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005216unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005217{
5218 char *encoding = NULL;
5219 char *errors = NULL;
5220 PyObject *v;
5221
5222 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5223 return NULL;
5224 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005225 if (v == NULL)
5226 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005227 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5228 PyErr_Format(PyExc_TypeError,
5229 "decoder did not return a string/unicode object "
5230 "(type=%.400s)",
5231 v->ob_type->tp_name);
5232 Py_DECREF(v);
5233 return NULL;
5234 }
5235 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005236
5237 onError:
5238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239}
5240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005241PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242"S.expandtabs([tabsize]) -> unicode\n\
5243\n\
5244Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005245If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246
5247static PyObject*
5248unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5249{
5250 Py_UNICODE *e;
5251 Py_UNICODE *p;
5252 Py_UNICODE *q;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005253 Py_ssize_t i, j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 PyUnicodeObject *u;
5255 int tabsize = 8;
5256
5257 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5258 return NULL;
5259
Thomas Wouters7e474022000-07-16 12:04:32 +00005260 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261 i = j = 0;
5262 e = self->str + self->length;
5263 for (p = self->str; p < e; p++)
5264 if (*p == '\t') {
5265 if (tabsize > 0)
5266 j += tabsize - (j % tabsize);
5267 }
5268 else {
5269 j++;
5270 if (*p == '\n' || *p == '\r') {
5271 i += j;
5272 j = 0;
5273 }
5274 }
5275
5276 /* Second pass: create output string and fill it */
5277 u = _PyUnicode_New(i + j);
5278 if (!u)
5279 return NULL;
5280
5281 j = 0;
5282 q = u->str;
5283
5284 for (p = self->str; p < e; p++)
5285 if (*p == '\t') {
5286 if (tabsize > 0) {
5287 i = tabsize - (j % tabsize);
5288 j += i;
5289 while (i--)
5290 *q++ = ' ';
5291 }
5292 }
5293 else {
5294 j++;
5295 *q++ = *p;
5296 if (*p == '\n' || *p == '\r')
5297 j = 0;
5298 }
5299
5300 return (PyObject*) u;
5301}
5302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005303PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304"S.find(sub [,start [,end]]) -> int\n\
5305\n\
5306Return the lowest index in S where substring sub is found,\n\
5307such that sub is contained within s[start,end]. Optional\n\
5308arguments start and end are interpreted as in slice notation.\n\
5309\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005310Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311
5312static PyObject *
5313unicode_find(PyUnicodeObject *self, PyObject *args)
5314{
5315 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005316 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005317 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 PyObject *result;
5319
Guido van Rossumb8872e62000-05-09 14:14:27 +00005320 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5321 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 return NULL;
5323 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5324 (PyObject *)substring);
5325 if (substring == NULL)
5326 return NULL;
5327
Martin v. Löwis18e16552006-02-15 17:27:45 +00005328 result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329
5330 Py_DECREF(substring);
5331 return result;
5332}
5333
5334static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00005335unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336{
5337 if (index < 0 || index >= self->length) {
5338 PyErr_SetString(PyExc_IndexError, "string index out of range");
5339 return NULL;
5340 }
5341
5342 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5343}
5344
5345static long
5346unicode_hash(PyUnicodeObject *self)
5347{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005348 /* Since Unicode objects compare equal to their ASCII string
5349 counterparts, they should use the individual character values
5350 as basis for their hash value. This is needed to assure that
5351 strings and Unicode objects behave in the same way as
5352 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005355 register Py_UNICODE *p;
5356 register long x;
5357
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 if (self->hash != -1)
5359 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005360 len = PyUnicode_GET_SIZE(self);
5361 p = PyUnicode_AS_UNICODE(self);
5362 x = *p << 7;
5363 while (--len >= 0)
5364 x = (1000003*x) ^ *p++;
5365 x ^= PyUnicode_GET_SIZE(self);
5366 if (x == -1)
5367 x = -2;
5368 self->hash = x;
5369 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370}
5371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005372PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373"S.index(sub [,start [,end]]) -> int\n\
5374\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005375Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376
5377static PyObject *
5378unicode_index(PyUnicodeObject *self, PyObject *args)
5379{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005380 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005382 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005383 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384
Guido van Rossumb8872e62000-05-09 14:14:27 +00005385 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5386 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5390 (PyObject *)substring);
5391 if (substring == NULL)
5392 return NULL;
5393
5394 result = findstring(self, substring, start, end, 1);
5395
5396 Py_DECREF(substring);
5397 if (result < 0) {
5398 PyErr_SetString(PyExc_ValueError, "substring not found");
5399 return NULL;
5400 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005401 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402}
5403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005404PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005405"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005407Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005408at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409
5410static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005411unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412{
5413 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5414 register const Py_UNICODE *e;
5415 int cased;
5416
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 /* Shortcut for single character strings */
5418 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005419 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005421 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005422 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005423 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005424
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 e = p + PyUnicode_GET_SIZE(self);
5426 cased = 0;
5427 for (; p < e; p++) {
5428 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005429
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005431 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 else if (!cased && Py_UNICODE_ISLOWER(ch))
5433 cased = 1;
5434 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005435 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436}
5437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005438PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005439"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005441Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005442at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
5444static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005445unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446{
5447 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5448 register const Py_UNICODE *e;
5449 int cased;
5450
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 /* Shortcut for single character strings */
5452 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005453 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005455 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005456 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005457 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005458
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 e = p + PyUnicode_GET_SIZE(self);
5460 cased = 0;
5461 for (; p < e; p++) {
5462 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005465 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 else if (!cased && Py_UNICODE_ISUPPER(ch))
5467 cased = 1;
5468 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005469 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470}
5471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005472PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005473"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005475Return True if S is a titlecased string and there is at least one\n\
5476character in S, i.e. upper- and titlecase characters may only\n\
5477follow uncased characters and lowercase characters only cased ones.\n\
5478Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479
5480static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005481unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482{
5483 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5484 register const Py_UNICODE *e;
5485 int cased, previous_is_cased;
5486
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 /* Shortcut for single character strings */
5488 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005489 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5490 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005492 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005493 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005494 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005495
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 e = p + PyUnicode_GET_SIZE(self);
5497 cased = 0;
5498 previous_is_cased = 0;
5499 for (; p < e; p++) {
5500 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005501
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5503 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005504 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 previous_is_cased = 1;
5506 cased = 1;
5507 }
5508 else if (Py_UNICODE_ISLOWER(ch)) {
5509 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005510 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 previous_is_cased = 1;
5512 cased = 1;
5513 }
5514 else
5515 previous_is_cased = 0;
5516 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005517 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518}
5519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005520PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005521"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005523Return True if all characters in S are whitespace\n\
5524and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
5526static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005527unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528{
5529 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5530 register const Py_UNICODE *e;
5531
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 /* Shortcut for single character strings */
5533 if (PyUnicode_GET_SIZE(self) == 1 &&
5534 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005535 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005537 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005538 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005539 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 e = p + PyUnicode_GET_SIZE(self);
5542 for (; p < e; p++) {
5543 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005544 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005546 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547}
5548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005549PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005550"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005551\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005552Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005553and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005554
5555static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005556unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005557{
5558 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5559 register const Py_UNICODE *e;
5560
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005561 /* Shortcut for single character strings */
5562 if (PyUnicode_GET_SIZE(self) == 1 &&
5563 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005564 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005565
5566 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005567 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005568 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005569
5570 e = p + PyUnicode_GET_SIZE(self);
5571 for (; p < e; p++) {
5572 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005573 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005574 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005575 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005576}
5577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005578PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005579"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005580\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005581Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005582and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005583
5584static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005585unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005586{
5587 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5588 register const Py_UNICODE *e;
5589
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005590 /* Shortcut for single character strings */
5591 if (PyUnicode_GET_SIZE(self) == 1 &&
5592 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005593 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005594
5595 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005596 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005597 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005598
5599 e = p + PyUnicode_GET_SIZE(self);
5600 for (; p < e; p++) {
5601 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005602 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005603 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005604 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005605}
5606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005607PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005608"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005610Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005611False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612
5613static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005614unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615{
5616 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5617 register const Py_UNICODE *e;
5618
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 /* Shortcut for single character strings */
5620 if (PyUnicode_GET_SIZE(self) == 1 &&
5621 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005622 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005624 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005625 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005626 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005627
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 e = p + PyUnicode_GET_SIZE(self);
5629 for (; p < e; p++) {
5630 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005631 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005633 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634}
5635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005636PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005637"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005639Return True if all characters in S are digits\n\
5640and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
5642static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005643unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644{
5645 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5646 register const Py_UNICODE *e;
5647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 /* Shortcut for single character strings */
5649 if (PyUnicode_GET_SIZE(self) == 1 &&
5650 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005651 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005653 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005654 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005655 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005656
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 e = p + PyUnicode_GET_SIZE(self);
5658 for (; p < e; p++) {
5659 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005660 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005662 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663}
5664
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005665PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005666"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005668Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005669False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670
5671static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005672unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673{
5674 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5675 register const Py_UNICODE *e;
5676
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 /* Shortcut for single character strings */
5678 if (PyUnicode_GET_SIZE(self) == 1 &&
5679 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005680 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005682 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00005683 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005684 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005685
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 e = p + PyUnicode_GET_SIZE(self);
5687 for (; p < e; p++) {
5688 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005689 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005691 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692}
5693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005694PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695"S.join(sequence) -> unicode\n\
5696\n\
5697Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005698sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
5700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005701unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005703 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704}
5705
Martin v. Löwis18e16552006-02-15 17:27:45 +00005706static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707unicode_length(PyUnicodeObject *self)
5708{
5709 return self->length;
5710}
5711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005712PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005713"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714\n\
5715Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005716done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
5718static PyObject *
5719unicode_ljust(PyUnicodeObject *self, PyObject *args)
5720{
Martin v. Löwis412fb672006-04-13 06:34:32 +00005721 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005722 Py_UNICODE fillchar = ' ';
5723
Martin v. Löwis412fb672006-04-13 06:34:32 +00005724 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 return NULL;
5726
Tim Peters7a29bd52001-09-12 03:03:31 +00005727 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 Py_INCREF(self);
5729 return (PyObject*) self;
5730 }
5731
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005732 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733}
5734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005735PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736"S.lower() -> unicode\n\
5737\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005738Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
5740static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005741unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 return fixup(self, fixlower);
5744}
5745
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
5754
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005755/* externally visible for str.strip(unicode) */
5756PyObject *
5757_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5758{
5759 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005760 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005761 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005762 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
5763 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005764
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005765 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
5766
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005767 i = 0;
5768 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005769 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
5770 i++;
5771 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005772 }
5773
5774 j = len;
5775 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005776 do {
5777 j--;
5778 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
5779 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005780 }
5781
5782 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005783 Py_INCREF(self);
5784 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005785 }
5786 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005787 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005788}
5789
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790
5791static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005792do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005794 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00005795 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005796
5797 i = 0;
5798 if (striptype != RIGHTSTRIP) {
5799 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5800 i++;
5801 }
5802 }
5803
5804 j = len;
5805 if (striptype != LEFTSTRIP) {
5806 do {
5807 j--;
5808 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5809 j++;
5810 }
5811
5812 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5813 Py_INCREF(self);
5814 return (PyObject*)self;
5815 }
5816 else
5817 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818}
5819
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005820
5821static PyObject *
5822do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5823{
5824 PyObject *sep = NULL;
5825
5826 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5827 return NULL;
5828
5829 if (sep != NULL && sep != Py_None) {
5830 if (PyUnicode_Check(sep))
5831 return _PyUnicode_XStrip(self, striptype, sep);
5832 else if (PyString_Check(sep)) {
5833 PyObject *res;
5834 sep = PyUnicode_FromObject(sep);
5835 if (sep==NULL)
5836 return NULL;
5837 res = _PyUnicode_XStrip(self, striptype, sep);
5838 Py_DECREF(sep);
5839 return res;
5840 }
5841 else {
5842 PyErr_Format(PyExc_TypeError,
5843 "%s arg must be None, unicode or str",
5844 STRIPNAME(striptype));
5845 return NULL;
5846 }
5847 }
5848
5849 return do_strip(self, striptype);
5850}
5851
5852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005853PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005854"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005855\n\
5856Return a copy of the string S with leading and trailing\n\
5857whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005858If chars is given and not None, remove characters in chars instead.\n\
5859If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005860
5861static PyObject *
5862unicode_strip(PyUnicodeObject *self, PyObject *args)
5863{
5864 if (PyTuple_GET_SIZE(args) == 0)
5865 return do_strip(self, BOTHSTRIP); /* Common case */
5866 else
5867 return do_argstrip(self, BOTHSTRIP, args);
5868}
5869
5870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005871PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005872"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005873\n\
5874Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005875If chars is given and not None, remove characters in chars instead.\n\
5876If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005877
5878static PyObject *
5879unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5880{
5881 if (PyTuple_GET_SIZE(args) == 0)
5882 return do_strip(self, LEFTSTRIP); /* Common case */
5883 else
5884 return do_argstrip(self, LEFTSTRIP, args);
5885}
5886
5887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005888PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005889"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005890\n\
5891Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005892If chars is given and not None, remove characters in chars instead.\n\
5893If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005894
5895static PyObject *
5896unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5897{
5898 if (PyTuple_GET_SIZE(args) == 0)
5899 return do_strip(self, RIGHTSTRIP); /* Common case */
5900 else
5901 return do_argstrip(self, RIGHTSTRIP, args);
5902}
5903
5904
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00005906unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907{
5908 PyUnicodeObject *u;
5909 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005910 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00005911 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912
5913 if (len < 0)
5914 len = 0;
5915
Tim Peters7a29bd52001-09-12 03:03:31 +00005916 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 /* no repeat, return original string */
5918 Py_INCREF(str);
5919 return (PyObject*) str;
5920 }
Tim Peters8f422462000-09-09 06:13:41 +00005921
5922 /* ensure # of chars needed doesn't overflow int and # of bytes
5923 * needed doesn't overflow size_t
5924 */
5925 nchars = len * str->length;
5926 if (len && nchars / len != str->length) {
5927 PyErr_SetString(PyExc_OverflowError,
5928 "repeated string is too long");
5929 return NULL;
5930 }
5931 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5932 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5933 PyErr_SetString(PyExc_OverflowError,
5934 "repeated string is too long");
5935 return NULL;
5936 }
5937 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 if (!u)
5939 return NULL;
5940
5941 p = u->str;
5942
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005943 if (str->length == 1 && len > 0) {
5944 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005945 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00005946 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005947 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00005948 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00005949 done = str->length;
5950 }
5951 while (done < nchars) {
5952 int n = (done <= nchars-done) ? done : nchars-done;
5953 Py_UNICODE_COPY(p+done, p, n);
5954 done += n;
5955 }
5956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957
5958 return (PyObject*) u;
5959}
5960
5961PyObject *PyUnicode_Replace(PyObject *obj,
5962 PyObject *subobj,
5963 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005964 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965{
5966 PyObject *self;
5967 PyObject *str1;
5968 PyObject *str2;
5969 PyObject *result;
5970
5971 self = PyUnicode_FromObject(obj);
5972 if (self == NULL)
5973 return NULL;
5974 str1 = PyUnicode_FromObject(subobj);
5975 if (str1 == NULL) {
5976 Py_DECREF(self);
5977 return NULL;
5978 }
5979 str2 = PyUnicode_FromObject(replobj);
5980 if (str2 == NULL) {
5981 Py_DECREF(self);
5982 Py_DECREF(str1);
5983 return NULL;
5984 }
Tim Petersced69f82003-09-16 20:30:58 +00005985 result = replace((PyUnicodeObject *)self,
5986 (PyUnicodeObject *)str1,
5987 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 maxcount);
5989 Py_DECREF(self);
5990 Py_DECREF(str1);
5991 Py_DECREF(str2);
5992 return result;
5993}
5994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005995PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996"S.replace (old, new[, maxsplit]) -> unicode\n\
5997\n\
5998Return a copy of S with all occurrences of substring\n\
5999old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006000given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001
6002static PyObject*
6003unicode_replace(PyUnicodeObject *self, PyObject *args)
6004{
6005 PyUnicodeObject *str1;
6006 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006007 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 PyObject *result;
6009
Martin v. Löwis18e16552006-02-15 17:27:45 +00006010 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 return NULL;
6012 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6013 if (str1 == NULL)
6014 return NULL;
6015 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006016 if (str2 == NULL) {
6017 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021 result = replace(self, str1, str2, maxcount);
6022
6023 Py_DECREF(str1);
6024 Py_DECREF(str2);
6025 return result;
6026}
6027
6028static
6029PyObject *unicode_repr(PyObject *unicode)
6030{
6031 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6032 PyUnicode_GET_SIZE(unicode),
6033 1);
6034}
6035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006036PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037"S.rfind(sub [,start [,end]]) -> int\n\
6038\n\
6039Return the highest index in S where substring sub is found,\n\
6040such that sub is contained within s[start,end]. Optional\n\
6041arguments start and end are interpreted as in slice notation.\n\
6042\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006043Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044
6045static PyObject *
6046unicode_rfind(PyUnicodeObject *self, PyObject *args)
6047{
6048 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006049 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006050 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 PyObject *result;
6052
Guido van Rossumb8872e62000-05-09 14:14:27 +00006053 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6054 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 return NULL;
6056 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6057 (PyObject *)substring);
6058 if (substring == NULL)
6059 return NULL;
6060
Martin v. Löwis18e16552006-02-15 17:27:45 +00006061 result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062
6063 Py_DECREF(substring);
6064 return result;
6065}
6066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006067PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068"S.rindex(sub [,start [,end]]) -> int\n\
6069\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006070Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071
6072static PyObject *
6073unicode_rindex(PyUnicodeObject *self, PyObject *args)
6074{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006075 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006077 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006078 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079
Guido van Rossumb8872e62000-05-09 14:14:27 +00006080 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6081 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 return NULL;
6083 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6084 (PyObject *)substring);
6085 if (substring == NULL)
6086 return NULL;
6087
6088 result = findstring(self, substring, start, end, -1);
6089
6090 Py_DECREF(substring);
6091 if (result < 0) {
6092 PyErr_SetString(PyExc_ValueError, "substring not found");
6093 return NULL;
6094 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006095 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096}
6097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006098PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006099"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100\n\
6101Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006102done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
6104static PyObject *
6105unicode_rjust(PyUnicodeObject *self, PyObject *args)
6106{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006107 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006108 Py_UNICODE fillchar = ' ';
6109
Martin v. Löwis412fb672006-04-13 06:34:32 +00006110 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 return NULL;
6112
Tim Peters7a29bd52001-09-12 03:03:31 +00006113 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 Py_INCREF(self);
6115 return (PyObject*) self;
6116 }
6117
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006118 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119}
6120
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006122unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123{
6124 /* standard clamping */
6125 if (start < 0)
6126 start = 0;
6127 if (end < 0)
6128 end = 0;
6129 if (end > self->length)
6130 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006131 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 /* full slice, return original string */
6133 Py_INCREF(self);
6134 return (PyObject*) self;
6135 }
6136 if (start > end)
6137 start = end;
6138 /* copy slice */
6139 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6140 end - start);
6141}
6142
6143PyObject *PyUnicode_Split(PyObject *s,
6144 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006145 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146{
6147 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006148
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 s = PyUnicode_FromObject(s);
6150 if (s == NULL)
6151 return NULL;
6152 if (sep != NULL) {
6153 sep = PyUnicode_FromObject(sep);
6154 if (sep == NULL) {
6155 Py_DECREF(s);
6156 return NULL;
6157 }
6158 }
6159
6160 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6161
6162 Py_DECREF(s);
6163 Py_XDECREF(sep);
6164 return result;
6165}
6166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006167PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168"S.split([sep [,maxsplit]]) -> list of strings\n\
6169\n\
6170Return a list of the words in S, using sep as the\n\
6171delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006172splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006173any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
6175static PyObject*
6176unicode_split(PyUnicodeObject *self, PyObject *args)
6177{
6178 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006179 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
Martin v. Löwis18e16552006-02-15 17:27:45 +00006181 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 return NULL;
6183
6184 if (substring == Py_None)
6185 return split(self, NULL, maxcount);
6186 else if (PyUnicode_Check(substring))
6187 return split(self, (PyUnicodeObject *)substring, maxcount);
6188 else
6189 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6190}
6191
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006192PyObject *PyUnicode_RSplit(PyObject *s,
6193 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006194 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006195{
6196 PyObject *result;
6197
6198 s = PyUnicode_FromObject(s);
6199 if (s == NULL)
6200 return NULL;
6201 if (sep != NULL) {
6202 sep = PyUnicode_FromObject(sep);
6203 if (sep == NULL) {
6204 Py_DECREF(s);
6205 return NULL;
6206 }
6207 }
6208
6209 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6210
6211 Py_DECREF(s);
6212 Py_XDECREF(sep);
6213 return result;
6214}
6215
6216PyDoc_STRVAR(rsplit__doc__,
6217"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6218\n\
6219Return a list of the words in S, using sep as the\n\
6220delimiter string, starting at the end of the string and\n\
6221working to the front. If maxsplit is given, at most maxsplit\n\
6222splits are done. If sep is not specified, any whitespace string\n\
6223is a separator.");
6224
6225static PyObject*
6226unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6227{
6228 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006229 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006230
Martin v. Löwis18e16552006-02-15 17:27:45 +00006231 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006232 return NULL;
6233
6234 if (substring == Py_None)
6235 return rsplit(self, NULL, maxcount);
6236 else if (PyUnicode_Check(substring))
6237 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6238 else
6239 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6240}
6241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006242PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006243"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244\n\
6245Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006246Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006247is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248
6249static PyObject*
6250unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6251{
Guido van Rossum86662912000-04-11 15:38:46 +00006252 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253
Guido van Rossum86662912000-04-11 15:38:46 +00006254 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 return NULL;
6256
Guido van Rossum86662912000-04-11 15:38:46 +00006257 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258}
6259
6260static
6261PyObject *unicode_str(PyUnicodeObject *self)
6262{
Fred Drakee4315f52000-05-09 19:53:39 +00006263 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264}
6265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006266PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267"S.swapcase() -> unicode\n\
6268\n\
6269Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006270and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271
6272static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006273unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 return fixup(self, fixswapcase);
6276}
6277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006278PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279"S.translate(table) -> unicode\n\
6280\n\
6281Return a copy of the string S, where all characters have been mapped\n\
6282through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006283Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6284Unmapped characters are left untouched. Characters mapped to None\n\
6285are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
6287static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006288unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289{
Tim Petersced69f82003-09-16 20:30:58 +00006290 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006292 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 "ignore");
6294}
6295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006296PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297"S.upper() -> unicode\n\
6298\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006299Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300
6301static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006302unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 return fixup(self, fixupper);
6305}
6306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006307PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308"S.zfill(width) -> unicode\n\
6309\n\
6310Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006311of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
6313static PyObject *
6314unicode_zfill(PyUnicodeObject *self, PyObject *args)
6315{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006316 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 PyUnicodeObject *u;
6318
Martin v. Löwis18e16552006-02-15 17:27:45 +00006319 Py_ssize_t width;
6320 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 return NULL;
6322
6323 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006324 if (PyUnicode_CheckExact(self)) {
6325 Py_INCREF(self);
6326 return (PyObject*) self;
6327 }
6328 else
6329 return PyUnicode_FromUnicode(
6330 PyUnicode_AS_UNICODE(self),
6331 PyUnicode_GET_SIZE(self)
6332 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 }
6334
6335 fill = width - self->length;
6336
6337 u = pad(self, fill, 0, '0');
6338
Walter Dörwald068325e2002-04-15 13:36:47 +00006339 if (u == NULL)
6340 return NULL;
6341
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 if (u->str[fill] == '+' || u->str[fill] == '-') {
6343 /* move sign to beginning of string */
6344 u->str[0] = u->str[fill];
6345 u->str[fill] = '0';
6346 }
6347
6348 return (PyObject*) u;
6349}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350
#if 0
/* Debugging aid (compiled out): expose the current value of
 * unicode_freelist_size as a Python int.
 */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;
    return PyInt_FromLong(size);
}
#endif
6358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006359PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006360"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006362Return True if S starts with the specified prefix, False otherwise.\n\
6363With optional start, test S beginning at that position.\n\
6364With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365
6366static PyObject *
6367unicode_startswith(PyUnicodeObject *self,
6368 PyObject *args)
6369{
6370 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006371 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006372 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 PyObject *result;
6374
Guido van Rossumb8872e62000-05-09 14:14:27 +00006375 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6376 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 return NULL;
6378 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6379 (PyObject *)substring);
6380 if (substring == NULL)
6381 return NULL;
6382
Guido van Rossum77f6a652002-04-03 22:41:51 +00006383 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
6385 Py_DECREF(substring);
6386 return result;
6387}
6388
6389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006390PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006391"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006393Return True if S ends with the specified suffix, False otherwise.\n\
6394With optional start, test S beginning at that position.\n\
6395With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396
6397static PyObject *
6398unicode_endswith(PyUnicodeObject *self,
6399 PyObject *args)
6400{
6401 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006403 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 PyObject *result;
6405
Guido van Rossumb8872e62000-05-09 14:14:27 +00006406 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6407 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 return NULL;
6409 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6410 (PyObject *)substring);
6411 if (substring == NULL)
6412 return NULL;
6413
Guido van Rossum77f6a652002-04-03 22:41:51 +00006414 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
6416 Py_DECREF(substring);
6417 return result;
6418}
6419
6420
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006421
6422static PyObject *
6423unicode_getnewargs(PyUnicodeObject *v)
6424{
6425 return Py_BuildValue("(u#)", v->str, v->length);
6426}
6427
6428
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429static PyMethodDef unicode_methods[] = {
6430
6431 /* Order is according to common usage: often used methods should
6432 appear first, since lookup is done sequentially. */
6433
Georg Brandlecdc0a92006-03-30 12:19:07 +00006434 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006435 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6436 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006437 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006438 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6439 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6440 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6441 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6442 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6443 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6444 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6445 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6446 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6447 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006448 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006449 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006450/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6451 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6452 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6453 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006454 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006455 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006456 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006457 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6458 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6459 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6460 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6461 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6462 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6463 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6464 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6465 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6466 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6467 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6468 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6469 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6470 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006471 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006472#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006473 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474#endif
6475
6476#if 0
6477 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006478 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479#endif
6480
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006481 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 {NULL, NULL}
6483};
6484
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006485static PyObject *
6486unicode_mod(PyObject *v, PyObject *w)
6487{
6488 if (!PyUnicode_Check(v)) {
6489 Py_INCREF(Py_NotImplemented);
6490 return Py_NotImplemented;
6491 }
6492 return PyUnicode_Format(v, w);
6493}
6494
6495static PyNumberMethods unicode_as_number = {
6496 0, /*nb_add*/
6497 0, /*nb_subtract*/
6498 0, /*nb_multiply*/
6499 0, /*nb_divide*/
6500 unicode_mod, /*nb_remainder*/
6501};
6502
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006504 (lenfunc) unicode_length, /* sq_length */
Georg Brandl347b3002006-03-30 11:57:00 +00006505 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006506 (ssizeargfunc) unicode_repeat, /* sq_repeat */
6507 (ssizeargfunc) unicode_getitem, /* sq_item */
6508 (ssizessizeargfunc) unicode_slice, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 0, /* sq_ass_item */
6510 0, /* sq_ass_slice */
Georg Brandl347b3002006-03-30 11:57:00 +00006511 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512};
6513
/* True when the type of object `o` has the Py_TPFLAGS_HAVE_INDEX
   feature flag set. */
#define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6515
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006516static PyObject*
6517unicode_subscript(PyUnicodeObject* self, PyObject* item)
6518{
Guido van Rossum38fff8c2006-03-07 18:50:55 +00006519 PyNumberMethods *nb = item->ob_type->tp_as_number;
6520 if (nb != NULL && HASINDEX(item) && nb->nb_index != NULL) {
6521 Py_ssize_t i = nb->nb_index(item);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006522 if (i == -1 && PyErr_Occurred())
6523 return NULL;
6524 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006525 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006526 return unicode_getitem(self, i);
6527 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006528 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006529 Py_UNICODE* source_buf;
6530 Py_UNICODE* result_buf;
6531 PyObject* result;
6532
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006533 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006534 &start, &stop, &step, &slicelength) < 0) {
6535 return NULL;
6536 }
6537
6538 if (slicelength <= 0) {
6539 return PyUnicode_FromUnicode(NULL, 0);
6540 } else {
6541 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00006542 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
6543 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006544
6545 if (result_buf == NULL)
6546 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006547
6548 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6549 result_buf[i] = source_buf[cur];
6550 }
Tim Petersced69f82003-09-16 20:30:58 +00006551
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006552 result = PyUnicode_FromUnicode(result_buf, slicelength);
6553 PyMem_FREE(result_buf);
6554 return result;
6555 }
6556 } else {
6557 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6558 return NULL;
6559 }
6560}
6561
6562static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006563 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006564 (binaryfunc)unicode_subscript, /* mp_subscript */
6565 (objobjargproc)0, /* mp_ass_subscript */
6566};
6567
Martin v. Löwis18e16552006-02-15 17:27:45 +00006568static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006570 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 const void **ptr)
6572{
6573 if (index != 0) {
6574 PyErr_SetString(PyExc_SystemError,
6575 "accessing non-existent unicode segment");
6576 return -1;
6577 }
6578 *ptr = (void *) self->str;
6579 return PyUnicode_GET_DATA_SIZE(self);
6580}
6581
Martin v. Löwis18e16552006-02-15 17:27:45 +00006582static Py_ssize_t
6583unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 const void **ptr)
6585{
6586 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006587 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 return -1;
6589}
6590
6591static int
6592unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006593 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594{
6595 if (lenp)
6596 *lenp = PyUnicode_GET_DATA_SIZE(self);
6597 return 1;
6598}
6599
Martin v. Löwiseb079f12006-02-16 14:32:27 +00006600static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006602 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 const void **ptr)
6604{
6605 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006606
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 if (index != 0) {
6608 PyErr_SetString(PyExc_SystemError,
6609 "accessing non-existent unicode segment");
6610 return -1;
6611 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006612 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 if (str == NULL)
6614 return -1;
6615 *ptr = (void *) PyString_AS_STRING(str);
6616 return PyString_GET_SIZE(str);
6617}
6618
6619/* Helpers for PyUnicode_Format() */
6620
6621static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006622getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006624 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 if (argidx < arglen) {
6626 (*p_argidx)++;
6627 if (arglen < 0)
6628 return args;
6629 else
6630 return PyTuple_GetItem(args, argidx);
6631 }
6632 PyErr_SetString(PyExc_TypeError,
6633 "not enough arguments for format string");
6634 return NULL;
6635}
6636
/* Bit flags carried in the `flags` argument of the format helpers.
   F_ALT selects the '#' alternate form; the other names presumably
   mirror the '-', '+', ' ' and '0' printf-style flags -- confirm
   against PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
6642
Martin v. Löwis18e16552006-02-15 17:27:45 +00006643static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00006644strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006646 register Py_ssize_t i;
6647 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 for (i = len - 1; i >= 0; i--)
6649 buffer[i] = (Py_UNICODE) charbuffer[i];
6650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 return len;
6652}
6653
Neal Norwitzfc76d632006-01-10 06:03:13 +00006654static int
6655doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
6656{
Tim Peters15231542006-02-16 01:08:01 +00006657 Py_ssize_t result;
6658
Neal Norwitzfc76d632006-01-10 06:03:13 +00006659 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006660 result = strtounicode(buffer, (char *)buffer);
6661 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006662}
6663
6664static int
6665longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
6666{
Tim Peters15231542006-02-16 01:08:01 +00006667 Py_ssize_t result;
6668
Neal Norwitzfc76d632006-01-10 06:03:13 +00006669 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00006670 result = strtounicode(buffer, (char *)buffer);
6671 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006672}
6673
Guido van Rossum078151d2002-08-11 04:24:12 +00006674/* XXX To save some code duplication, formatfloat/long/int could have been
6675 shared with stringobject.c, converting from 8-bit to Unicode after the
6676 formatting is done. */
6677
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678static int
6679formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006680 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 int flags,
6682 int prec,
6683 int type,
6684 PyObject *v)
6685{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006686 /* fmt = '%#.' + `prec` + `type`
6687 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 char fmt[20];
6689 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006690
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 x = PyFloat_AsDouble(v);
6692 if (x == -1.0 && PyErr_Occurred())
6693 return -1;
6694 if (prec < 0)
6695 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6697 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006698 /* Worst case length calc to ensure no buffer overrun:
6699
6700 'g' formats:
6701 fmt = %#.<prec>g
6702 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6703 for any double rep.)
6704 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6705
6706 'f' formats:
6707 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6708 len = 1 + 50 + 1 + prec = 52 + prec
6709
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006710 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006711 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006712
6713 */
6714 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6715 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006716 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006717 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006718 return -1;
6719 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006720 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6721 (flags&F_ALT) ? "#" : "",
6722 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00006723 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724}
6725
Tim Peters38fd5b62000-09-21 05:43:11 +00006726static PyObject*
6727formatlong(PyObject *val, int flags, int prec, int type)
6728{
6729 char *buf;
6730 int i, len;
6731 PyObject *str; /* temporary string object. */
6732 PyUnicodeObject *result;
6733
6734 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6735 if (!str)
6736 return NULL;
6737 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006738 if (!result) {
6739 Py_DECREF(str);
6740 return NULL;
6741 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006742 for (i = 0; i < len; i++)
6743 result->str[i] = buf[i];
6744 result->str[len] = 0;
6745 Py_DECREF(str);
6746 return (PyObject*)result;
6747}
6748
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749static int
6750formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006751 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 int flags,
6753 int prec,
6754 int type,
6755 PyObject *v)
6756{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006757 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006758 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6759 * + 1 + 1
6760 * = 24
6761 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006762 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006763 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 long x;
6765
6766 x = PyInt_AsLong(v);
6767 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006768 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006769 if (x < 0 && type == 'u') {
6770 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006771 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006772 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6773 sign = "-";
6774 else
6775 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006777 prec = 1;
6778
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006779 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6780 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006781 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006782 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006783 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006784 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006785 return -1;
6786 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006787
6788 if ((flags & F_ALT) &&
6789 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006790 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006791 * of issues that cause pain:
6792 * - when 0 is being converted, the C standard leaves off
6793 * the '0x' or '0X', which is inconsistent with other
6794 * %#x/%#X conversions and inconsistent with Python's
6795 * hex() function
6796 * - there are platforms that violate the standard and
6797 * convert 0 with the '0x' or '0X'
6798 * (Metrowerks, Compaq Tru64)
6799 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006800 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006801 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006802 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006803 * We can achieve the desired consistency by inserting our
6804 * own '0x' or '0X' prefix, and substituting %x/%X in place
6805 * of %#x/%#X.
6806 *
6807 * Note that this is the same approach as used in
6808 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006809 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006810 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6811 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006812 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006813 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006814 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6815 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006816 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006817 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006818 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00006819 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006820 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00006821 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822}
6823
6824static int
6825formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006826 size_t buflen,
6827 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006829 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006830 if (PyUnicode_Check(v)) {
6831 if (PyUnicode_GET_SIZE(v) != 1)
6832 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006836 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006837 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006838 goto onError;
6839 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
6842 else {
6843 /* Integer input truncated to a character */
6844 long x;
6845 x = PyInt_AsLong(v);
6846 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006847 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006848#ifdef Py_UNICODE_WIDE
6849 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006850 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006851 "%c arg not in range(0x110000) "
6852 "(wide Python build)");
6853 return -1;
6854 }
6855#else
6856 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006857 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006858 "%c arg not in range(0x10000) "
6859 "(narrow Python build)");
6860 return -1;
6861 }
6862#endif
6863 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 }
6865 buf[1] = '\0';
6866 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006867
6868 onError:
6869 PyErr_SetString(PyExc_TypeError,
6870 "%c requires int or char");
6871 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872}
6873
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006874/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6875
6876 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6877 chars are formatted. XXX This is a magic number. Each formatting
6878 routine does bounds checking to ensure no overflow, but a better
6879 solution may be to malloc a buffer of appropriate size for each
6880 format. For now, the current solution is sufficient.
6881*/
6882#define FORMATBUFLEN (size_t)120
6883
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884PyObject *PyUnicode_Format(PyObject *format,
6885 PyObject *args)
6886{
6887 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006888 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 int args_owned = 0;
6890 PyUnicodeObject *result = NULL;
6891 PyObject *dict = NULL;
6892 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006893
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 if (format == NULL || args == NULL) {
6895 PyErr_BadInternalCall();
6896 return NULL;
6897 }
6898 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006899 if (uformat == NULL)
6900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 fmt = PyUnicode_AS_UNICODE(uformat);
6902 fmtcnt = PyUnicode_GET_SIZE(uformat);
6903
6904 reslen = rescnt = fmtcnt + 100;
6905 result = _PyUnicode_New(reslen);
6906 if (result == NULL)
6907 goto onError;
6908 res = PyUnicode_AS_UNICODE(result);
6909
6910 if (PyTuple_Check(args)) {
6911 arglen = PyTuple_Size(args);
6912 argidx = 0;
6913 }
6914 else {
6915 arglen = -1;
6916 argidx = -2;
6917 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006918 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6919 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 dict = args;
6921
6922 while (--fmtcnt >= 0) {
6923 if (*fmt != '%') {
6924 if (--rescnt < 0) {
6925 rescnt = fmtcnt + 100;
6926 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006927 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006928 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6930 --rescnt;
6931 }
6932 *res++ = *fmt++;
6933 }
6934 else {
6935 /* Got a format specifier */
6936 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006937 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 Py_UNICODE c = '\0';
6940 Py_UNICODE fill;
6941 PyObject *v = NULL;
6942 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006943 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006945 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006946 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947
6948 fmt++;
6949 if (*fmt == '(') {
6950 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006951 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 PyObject *key;
6953 int pcount = 1;
6954
6955 if (dict == NULL) {
6956 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006957 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 goto onError;
6959 }
6960 ++fmt;
6961 --fmtcnt;
6962 keystart = fmt;
6963 /* Skip over balanced parentheses */
6964 while (pcount > 0 && --fmtcnt >= 0) {
6965 if (*fmt == ')')
6966 --pcount;
6967 else if (*fmt == '(')
6968 ++pcount;
6969 fmt++;
6970 }
6971 keylen = fmt - keystart - 1;
6972 if (fmtcnt < 0 || pcount > 0) {
6973 PyErr_SetString(PyExc_ValueError,
6974 "incomplete format key");
6975 goto onError;
6976 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006977#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006978 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 then looked up since Python uses strings to hold
6980 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006981 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 key = PyUnicode_EncodeUTF8(keystart,
6983 keylen,
6984 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006985#else
6986 key = PyUnicode_FromUnicode(keystart, keylen);
6987#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 if (key == NULL)
6989 goto onError;
6990 if (args_owned) {
6991 Py_DECREF(args);
6992 args_owned = 0;
6993 }
6994 args = PyObject_GetItem(dict, key);
6995 Py_DECREF(key);
6996 if (args == NULL) {
6997 goto onError;
6998 }
6999 args_owned = 1;
7000 arglen = -1;
7001 argidx = -2;
7002 }
7003 while (--fmtcnt >= 0) {
7004 switch (c = *fmt++) {
7005 case '-': flags |= F_LJUST; continue;
7006 case '+': flags |= F_SIGN; continue;
7007 case ' ': flags |= F_BLANK; continue;
7008 case '#': flags |= F_ALT; continue;
7009 case '0': flags |= F_ZERO; continue;
7010 }
7011 break;
7012 }
7013 if (c == '*') {
7014 v = getnextarg(args, arglen, &argidx);
7015 if (v == NULL)
7016 goto onError;
7017 if (!PyInt_Check(v)) {
7018 PyErr_SetString(PyExc_TypeError,
7019 "* wants int");
7020 goto onError;
7021 }
7022 width = PyInt_AsLong(v);
7023 if (width < 0) {
7024 flags |= F_LJUST;
7025 width = -width;
7026 }
7027 if (--fmtcnt >= 0)
7028 c = *fmt++;
7029 }
7030 else if (c >= '0' && c <= '9') {
7031 width = c - '0';
7032 while (--fmtcnt >= 0) {
7033 c = *fmt++;
7034 if (c < '0' || c > '9')
7035 break;
7036 if ((width*10) / 10 != width) {
7037 PyErr_SetString(PyExc_ValueError,
7038 "width too big");
7039 goto onError;
7040 }
7041 width = width*10 + (c - '0');
7042 }
7043 }
7044 if (c == '.') {
7045 prec = 0;
7046 if (--fmtcnt >= 0)
7047 c = *fmt++;
7048 if (c == '*') {
7049 v = getnextarg(args, arglen, &argidx);
7050 if (v == NULL)
7051 goto onError;
7052 if (!PyInt_Check(v)) {
7053 PyErr_SetString(PyExc_TypeError,
7054 "* wants int");
7055 goto onError;
7056 }
7057 prec = PyInt_AsLong(v);
7058 if (prec < 0)
7059 prec = 0;
7060 if (--fmtcnt >= 0)
7061 c = *fmt++;
7062 }
7063 else if (c >= '0' && c <= '9') {
7064 prec = c - '0';
7065 while (--fmtcnt >= 0) {
7066 c = Py_CHARMASK(*fmt++);
7067 if (c < '0' || c > '9')
7068 break;
7069 if ((prec*10) / 10 != prec) {
7070 PyErr_SetString(PyExc_ValueError,
7071 "prec too big");
7072 goto onError;
7073 }
7074 prec = prec*10 + (c - '0');
7075 }
7076 }
7077 } /* prec */
7078 if (fmtcnt >= 0) {
7079 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 if (--fmtcnt >= 0)
7081 c = *fmt++;
7082 }
7083 }
7084 if (fmtcnt < 0) {
7085 PyErr_SetString(PyExc_ValueError,
7086 "incomplete format");
7087 goto onError;
7088 }
7089 if (c != '%') {
7090 v = getnextarg(args, arglen, &argidx);
7091 if (v == NULL)
7092 goto onError;
7093 }
7094 sign = 0;
7095 fill = ' ';
7096 switch (c) {
7097
7098 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007099 pbuf = formatbuf;
7100 /* presume that buffer length is at least 1 */
7101 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 len = 1;
7103 break;
7104
7105 case 's':
7106 case 'r':
7107 if (PyUnicode_Check(v) && c == 's') {
7108 temp = v;
7109 Py_INCREF(temp);
7110 }
7111 else {
7112 PyObject *unicode;
7113 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007114 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115 else
7116 temp = PyObject_Repr(v);
7117 if (temp == NULL)
7118 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007119 if (PyUnicode_Check(temp))
7120 /* nothing to do */;
7121 else if (PyString_Check(temp)) {
7122 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007123 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007125 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007127 Py_DECREF(temp);
7128 temp = unicode;
7129 if (temp == NULL)
7130 goto onError;
7131 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007132 else {
7133 Py_DECREF(temp);
7134 PyErr_SetString(PyExc_TypeError,
7135 "%s argument has non-string str()");
7136 goto onError;
7137 }
7138 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007139 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 len = PyUnicode_GET_SIZE(temp);
7141 if (prec >= 0 && len > prec)
7142 len = prec;
7143 break;
7144
7145 case 'i':
7146 case 'd':
7147 case 'u':
7148 case 'o':
7149 case 'x':
7150 case 'X':
7151 if (c == 'i')
7152 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007153 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007154 temp = formatlong(v, flags, prec, c);
7155 if (!temp)
7156 goto onError;
7157 pbuf = PyUnicode_AS_UNICODE(temp);
7158 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007159 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007161 else {
7162 pbuf = formatbuf;
7163 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7164 flags, prec, c, v);
7165 if (len < 0)
7166 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007167 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007168 }
7169 if (flags & F_ZERO)
7170 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 break;
7172
7173 case 'e':
7174 case 'E':
7175 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007176 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177 case 'g':
7178 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007179 if (c == 'F')
7180 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007181 pbuf = formatbuf;
7182 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7183 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 if (len < 0)
7185 goto onError;
7186 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007187 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 fill = '0';
7189 break;
7190
7191 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007192 pbuf = formatbuf;
7193 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194 if (len < 0)
7195 goto onError;
7196 break;
7197
7198 default:
7199 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007200 "unsupported format character '%c' (0x%x) "
7201 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007202 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007203 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007204 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 goto onError;
7206 }
7207 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007208 if (*pbuf == '-' || *pbuf == '+') {
7209 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 len--;
7211 }
7212 else if (flags & F_SIGN)
7213 sign = '+';
7214 else if (flags & F_BLANK)
7215 sign = ' ';
7216 else
7217 sign = 0;
7218 }
7219 if (width < len)
7220 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007221 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 reslen -= rescnt;
7223 rescnt = width + fmtcnt + 100;
7224 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007225 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007226 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00007227 PyErr_NoMemory();
7228 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007229 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00007230 if (_PyUnicode_Resize(&result, reslen) < 0) {
7231 Py_XDECREF(temp);
7232 goto onError;
7233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234 res = PyUnicode_AS_UNICODE(result)
7235 + reslen - rescnt;
7236 }
7237 if (sign) {
7238 if (fill != ' ')
7239 *res++ = sign;
7240 rescnt--;
7241 if (width > len)
7242 width--;
7243 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007244 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7245 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007246 assert(pbuf[1] == c);
7247 if (fill != ' ') {
7248 *res++ = *pbuf++;
7249 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007250 }
Tim Petersfff53252001-04-12 18:38:48 +00007251 rescnt -= 2;
7252 width -= 2;
7253 if (width < 0)
7254 width = 0;
7255 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 if (width > len && !(flags & F_LJUST)) {
7258 do {
7259 --rescnt;
7260 *res++ = fill;
7261 } while (--width > len);
7262 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007263 if (fill == ' ') {
7264 if (sign)
7265 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007266 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007267 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007268 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007269 *res++ = *pbuf++;
7270 *res++ = *pbuf++;
7271 }
7272 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007273 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 res += len;
7275 rescnt -= len;
7276 while (--width >= len) {
7277 --rescnt;
7278 *res++ = ' ';
7279 }
7280 if (dict && (argidx < arglen) && c != '%') {
7281 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007282 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007283 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 goto onError;
7285 }
7286 Py_XDECREF(temp);
7287 } /* '%' */
7288 } /* until end */
7289 if (argidx < arglen && !dict) {
7290 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007291 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 goto onError;
7293 }
7294
Thomas Woutersa96affe2006-03-12 00:29:36 +00007295 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7296 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297 if (args_owned) {
7298 Py_DECREF(args);
7299 }
7300 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 return (PyObject *)result;
7302
7303 onError:
7304 Py_XDECREF(result);
7305 Py_DECREF(uformat);
7306 if (args_owned) {
7307 Py_DECREF(args);
7308 }
7309 return NULL;
7310}
7311
7312static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007313 (readbufferproc) unicode_buffer_getreadbuf,
7314 (writebufferproc) unicode_buffer_getwritebuf,
7315 (segcountproc) unicode_buffer_getsegcount,
7316 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317};
7318
Jeremy Hylton938ace62002-07-17 16:30:39 +00007319static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007320unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7321
Tim Peters6d6c1a32001-08-02 04:15:00 +00007322static PyObject *
7323unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7324{
7325 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00007326 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007327 char *encoding = NULL;
7328 char *errors = NULL;
7329
Guido van Rossume023fe02001-08-30 03:12:59 +00007330 if (type != &PyUnicode_Type)
7331 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007332 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7333 kwlist, &x, &encoding, &errors))
7334 return NULL;
7335 if (x == NULL)
7336 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007337 if (encoding == NULL && errors == NULL)
7338 return PyObject_Unicode(x);
7339 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007340 return PyUnicode_FromEncodedObject(x, encoding, errors);
7341}
7342
Guido van Rossume023fe02001-08-30 03:12:59 +00007343static PyObject *
7344unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7345{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007346 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007347 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00007348
7349 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7350 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7351 if (tmp == NULL)
7352 return NULL;
7353 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007354 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007355 if (pnew == NULL) {
7356 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007357 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007358 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007359 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7360 if (pnew->str == NULL) {
7361 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007362 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007363 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007364 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007365 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007366 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7367 pnew->length = n;
7368 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007369 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007370 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007371}
7372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007373PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007374"unicode(string [, encoding[, errors]]) -> object\n\
7375\n\
7376Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007377encoding defaults to the current default string encoding.\n\
7378errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007379
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380PyTypeObject PyUnicode_Type = {
7381 PyObject_HEAD_INIT(&PyType_Type)
7382 0, /* ob_size */
7383 "unicode", /* tp_name */
7384 sizeof(PyUnicodeObject), /* tp_size */
7385 0, /* tp_itemsize */
7386 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007387 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007389 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 0, /* tp_setattr */
7391 (cmpfunc) unicode_compare, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00007392 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007393 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007395 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 (hashfunc) unicode_hash, /* tp_hash*/
7397 0, /* tp_call*/
7398 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007399 PyObject_GenericGetAttr, /* tp_getattro */
7400 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007402 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7403 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007404 unicode_doc, /* tp_doc */
7405 0, /* tp_traverse */
7406 0, /* tp_clear */
7407 0, /* tp_richcompare */
7408 0, /* tp_weaklistoffset */
7409 0, /* tp_iter */
7410 0, /* tp_iternext */
7411 unicode_methods, /* tp_methods */
7412 0, /* tp_members */
7413 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007414 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007415 0, /* tp_dict */
7416 0, /* tp_descr_get */
7417 0, /* tp_descr_set */
7418 0, /* tp_dictoffset */
7419 0, /* tp_init */
7420 0, /* tp_alloc */
7421 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007422 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423};
7424
7425/* Initialize the Unicode implementation */
7426
Thomas Wouters78890102000-07-22 19:25:51 +00007427void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007429 int i;
7430
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007431 /* XXX - move this array to unicodectype.c ? */
7432 Py_UNICODE linebreak[] = {
7433 0x000A, /* LINE FEED */
7434 0x000D, /* CARRIAGE RETURN */
7435 0x001C, /* FILE SEPARATOR */
7436 0x001D, /* GROUP SEPARATOR */
7437 0x001E, /* RECORD SEPARATOR */
7438 0x0085, /* NEXT LINE */
7439 0x2028, /* LINE SEPARATOR */
7440 0x2029, /* PARAGRAPH SEPARATOR */
7441 };
7442
Fred Drakee4315f52000-05-09 19:53:39 +00007443 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007444 unicode_freelist = NULL;
7445 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007447 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007448 for (i = 0; i < 256; i++)
7449 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007450 if (PyType_Ready(&PyUnicode_Type) < 0)
7451 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007452
7453 /* initialize the linebreak bloom filter */
7454 bloom_linebreak = make_bloom_mask(
7455 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7456 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457}
7458
7459/* Finalize the Unicode implementation */
7460
7461void
Thomas Wouters78890102000-07-22 19:25:51 +00007462_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007464 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007465 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007467 Py_XDECREF(unicode_empty);
7468 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007469
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007470 for (i = 0; i < 256; i++) {
7471 if (unicode_latin1[i]) {
7472 Py_DECREF(unicode_latin1[i]);
7473 unicode_latin1[i] = NULL;
7474 }
7475 }
7476
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007477 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478 PyUnicodeObject *v = u;
7479 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007480 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007481 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007482 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007483 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007485 unicode_freelist = NULL;
7486 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007488
Anthony Baxterac6bd462006-04-13 02:06:09 +00007489#ifdef __cplusplus
7490}
7491#endif
7492
7493
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007494/*
7495Local variables:
7496c-basic-offset: 4
7497indent-tabs-mode: nil
7498End:
7499*/