blob: f54e7b23a829e95607b06f0d511f3526128f71d0 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of deallocated Unicode objects that are
   kept on the free list for later reuse. */

#define MAX_UNICODE_FREELIST_SIZE 1024
55
/* Threshold for the free list "keep alive" optimization.

   An object that is put on the free list keeps its character buffer
   allocated as long as its length is below this limit; longer buffers
   are released when the object is deallocated.  This saves malloc()
   calls for small Unicode objects.

   Worst case, this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively disables the feature.

   Note: This is an experimental feature !  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
/* Endianness switches: little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000096static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000115Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000116PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000117{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000118#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Type of a bloom bitmask: one bit for each value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

/* Mask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long: shifting a plain int 1 by
   31 overflows a signed 32-bit int, which is undefined behaviour.  Both
   arguments are parenthesized so that arbitrary expressions can be
   passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-check with the bloom mask before doing the real
   Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
Fredrik Lundh77633512006-05-23 19:47:35 +0000166 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000167}
168
/* True if chr is a member of set: the bloom mask is consulted first so
   that most non-members are rejected without scanning the set.  The
   whole expansion is parenthesized so that the macro can safely be
   negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
171
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172/* --- Unicode Object ----------------------------------------------------- */
173
174static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000176 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177{
178 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000179
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000180 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000181 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000182 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000187
Martin v. Löwis80d2e592006-04-13 06:06:08 +0000188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000191 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000192 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 return -1;
195 }
196
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000199 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000200 it contains). */
201
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
Anthony Baxtera6286212006-04-11 07:42:36 +0000205 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000210 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000212 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000213 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 }
218 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000219
Guido van Rossumd57fd912000-03-10 22:53:23 +0000220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000224 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233{
234 register PyUnicodeObject *unicode;
235
Andrew Dalkee0df7622006-05-27 11:04:36 +0000236 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000245 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000250 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000251 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 }
255 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000256 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260 }
261 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000268 if (!unicode->str) {
269 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000270 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000271 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000272 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000279 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000283 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000288 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290}
291
292static
Guido van Rossum9475a232001-10-05 20:51:39 +0000293void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000299 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 unicode->str = NULL;
301 unicode->length = 0;
302 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000306 }
307 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311 }
312 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000313 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000314 Py_XDECREF(unicode->defenc);
Martin v. Löwis68192102007-07-21 06:55:02 +0000315 Py_Type(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316 }
317}
318
Martin v. Löwis18e16552006-02-15 17:27:45 +0000319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
Martin v. Löwis68192102007-07-21 06:55:02 +0000329 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000337 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000344 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
354/* Internal API for use in unicodeobject.c only ! */
355#define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
Guido van Rossumd57fd912000-03-10 22:53:23 +0000358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000359 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360{
361 PyUnicodeObject *unicode;
362
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000379 if (!unicode)
380 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000381 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
Tim Petersced69f82003-09-16 20:30:58 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000395 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000403 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000419#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000420 {
421 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000422 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000424 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
Martin v. Löwis18e16552006-02-15 17:27:45 +0000432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000440
441 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000450 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000451 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000452 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000453 *w++ = *u++;
454 }
455#endif
456
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return size;
461}
462
463#endif
464
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000467 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
Hye-Shik Chang40574832004-04-06 07:24:51 +0000485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000487}
488
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000511 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000518
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000524
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000533 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000534 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000535 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000536 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000542 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000549 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000554 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +0000557 Py_Type(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000558 goto onError;
559 }
Tim Petersced69f82003-09-16 20:30:58 +0000560
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000561 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 if (len == 0) {
563 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000564 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000565 }
Tim Petersced69f82003-09-16 20:30:58 +0000566 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000567 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000568
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000569 return v;
570
571 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573}
574
575PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000576 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000581
582 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000587 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000606 "decoder did not return an unicode object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000607 Py_Type(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000613
Guido van Rossumd57fd912000-03-10 22:53:23 +0000614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000687
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
Fred Drakee4315f52000-05-09 19:53:39 +0000692
Tim Petersced69f82003-09-16 20:30:58 +0000693 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000699 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000716 "encoder did not return a string object (type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +0000717 Py_Type(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000722
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723 onError:
724 return NULL;
725}
726
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
Martin v. Löwis18e16552006-02-15 17:27:45 +0000752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
Thomas Wouters78890102000-07-22 19:25:51 +0000764const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000772
Fred Drakee4315f52000-05-09 19:53:39 +0000773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000780 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000790 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
Walter Dörwald87578782007-08-30 15:30:09 +0000798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
799 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000800 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000801{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000802 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000803
804 PyObject *restuple = NULL;
805 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000806 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
807 Py_ssize_t requiredsize;
808 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000810 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811 int res = -1;
812
813 if (*errorHandler == NULL) {
814 *errorHandler = PyCodec_LookupError(errors);
815 if (*errorHandler == NULL)
816 goto onError;
817 }
818
819 if (*exceptionObject == NULL) {
820 *exceptionObject = PyUnicodeDecodeError_Create(
821 encoding, input, insize, *startinpos, *endinpos, reason);
822 if (*exceptionObject == NULL)
823 goto onError;
824 }
825 else {
826 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
827 goto onError;
828 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
829 goto onError;
830 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
831 goto onError;
832 }
833
834 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
835 if (restuple == NULL)
836 goto onError;
837 if (!PyTuple_Check(restuple)) {
838 PyErr_Format(PyExc_TypeError, &argparse[4]);
839 goto onError;
840 }
841 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
842 goto onError;
843 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000844 newpos = insize+newpos;
845 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +0000846 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000847 goto onError;
848 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849
850 /* need more space? (at least enough for what we
851 have+the replacement+the rest of the string (starting
852 at the new input position), so we won't have to check space
853 when there are no errors in the rest of the string) */
854 repptr = PyUnicode_AS_UNICODE(repunicode);
855 repsize = PyUnicode_GET_SIZE(repunicode);
856 requiredsize = *outpos + repsize + insize-newpos;
857 if (requiredsize > outsize) {
858 if (requiredsize<2*outsize)
859 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000860 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861 goto onError;
862 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
863 }
864 *endinpos = newpos;
865 *inptr = input + newpos;
866 Py_UNICODE_COPY(*outptr, repptr, repsize);
867 *outptr += repsize;
868 *outpos += repsize;
869 /* we made it! */
870 res = 0;
871
872 onError:
873 Py_XDECREF(restuple);
874 return res;
875}
876
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000877/* --- UTF-7 Codec -------------------------------------------------------- */
878
879/* see RFC2152 for details */
880
/* Classification of the 128 ASCII code points for UTF-7 encoding.
   The table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* SPECIAL(c, encodeO, encodeWS) is true when c must not be emitted
   directly: anything outside 1..127, any class-1 character, whitespace
   when encodeWS is set, and Set O characters when encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
909
/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if c is one of the 64 base64 digits.
   The range test keeps values outside 0..127 away from isalnum(): passing
   it anything not representable as unsigned char is undefined, and
   characters 128..255 may be alphanumeric in some locales although they
   are never base64 digits. */
#define B64CHAR(c) \
    ((((c) & ~0x7f) == 0 && isalnum((int)(c))) || (c) == '+' || (c) == '/')
/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000917
/* ENCODE(out, ch, bits): while at least six bits are pending in ch, emit
   the top six pending bits as one base64 digit through `out` and lower
   `bits` accordingly.  `out` and `bits` are modified in place.
   Wrapped in do/while(0) so the macro behaves as a single statement. */
#define ENCODE(out, ch, bits)                   \
    do {                                        \
        while ((bits) >= 6) {                   \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6;                        \
        }                                       \
    } while (0)
923
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   ch, extract the top 16 pending bits as one Py_UNICODE and store it
   through `out`.  `out`, `bits` and `surrogate` are modified in place.

   The macro relies on the expansion site providing a variable `errmsg`
   and a label `utf7Error`: a code unit in 0xDC00..0xDFFF sets errmsg and
   jumps there.
   Wrapped in do/while(0) so the macro behaves as a single statement.

   NOTE(review): the range tested is 0xDC00..0xDFFF, although the comments
   below speak of the "high surrogate" -- confirm which is intended. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */                          \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000942
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000943PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000944 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000945 const char *errors)
946{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000947 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000948 Py_ssize_t startinpos;
949 Py_ssize_t endinpos;
950 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000951 const char *e;
952 PyUnicodeObject *unicode;
953 Py_UNICODE *p;
954 const char *errmsg = "";
955 int inShift = 0;
956 unsigned int bitsleft = 0;
957 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000958 int surrogate = 0;
959 PyObject *errorHandler = NULL;
960 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000961
962 unicode = _PyUnicode_New(size);
963 if (!unicode)
964 return NULL;
965 if (size == 0)
966 return (PyObject *)unicode;
967
968 p = unicode->str;
969 e = s + size;
970
971 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 Py_UNICODE ch;
973 restart:
974 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000975
976 if (inShift) {
977 if ((ch == '-') || !B64CHAR(ch)) {
978 inShift = 0;
979 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000980
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000981 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
982 if (bitsleft >= 6) {
983 /* The shift sequence has a partial character in it. If
984 bitsleft < 6 then we could just classify it as padding
985 but that is not the case here */
986
987 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000988 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000989 }
990 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000991 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000992 here so indicate the potential of a misencoded character. */
993
994 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
995 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
996 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000997 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000998 }
999
1000 if (ch == '-') {
1001 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001002 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003 inShift = 1;
1004 }
1005 } else if (SPECIAL(ch,0,0)) {
1006 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001007 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001008 } else {
1009 *p++ = ch;
1010 }
1011 } else {
1012 charsleft = (charsleft << 6) | UB64(ch);
1013 bitsleft += 6;
1014 s++;
1015 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1016 }
1017 }
1018 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001019 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 s++;
1021 if (s < e && *s == '-') {
1022 s++;
1023 *p++ = '+';
1024 } else
1025 {
1026 inShift = 1;
1027 bitsleft = 0;
1028 }
1029 }
1030 else if (SPECIAL(ch,0,0)) {
1031 errmsg = "unexpected special character";
1032 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001033 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001034 }
1035 else {
1036 *p++ = ch;
1037 s++;
1038 }
1039 continue;
1040 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001041 outpos = p-PyUnicode_AS_UNICODE(unicode);
1042 endinpos = s-starts;
1043 if (unicode_decode_call_errorhandler(
1044 errors, &errorHandler,
1045 "utf7", errmsg,
1046 starts, size, &startinpos, &endinpos, &exc, &s,
1047 (PyObject **)&unicode, &outpos, &p))
1048 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001049 }
1050
1051 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001052 outpos = p-PyUnicode_AS_UNICODE(unicode);
1053 endinpos = size;
1054 if (unicode_decode_call_errorhandler(
1055 errors, &errorHandler,
1056 "utf7", "unterminated shift sequence",
1057 starts, size, &startinpos, &endinpos, &exc, &s,
1058 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001059 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001060 if (s < e)
1061 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001062 }
1063
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001064 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001065 goto onError;
1066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001067 Py_XDECREF(errorHandler);
1068 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 return (PyObject *)unicode;
1070
1071onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001072 Py_XDECREF(errorHandler);
1073 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001074 Py_DECREF(unicode);
1075 return NULL;
1076}
1077
1078
1079PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001080 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001081 int encodeSetO,
1082 int encodeWhiteSpace,
1083 const char *errors)
1084{
1085 PyObject *v;
1086 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001087 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001088 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001089 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001090 unsigned int bitsleft = 0;
1091 unsigned long charsleft = 0;
1092 char * out;
1093 char * start;
1094
1095 if (size == 0)
1096 return PyString_FromStringAndSize(NULL, 0);
1097
1098 v = PyString_FromStringAndSize(NULL, cbAllocated);
1099 if (v == NULL)
1100 return NULL;
1101
1102 start = out = PyString_AS_STRING(v);
1103 for (;i < size; ++i) {
1104 Py_UNICODE ch = s[i];
1105
1106 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001107 if (ch == '+') {
1108 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 *out++ = '-';
1110 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1111 charsleft = ch;
1112 bitsleft = 16;
1113 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001114 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001115 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001116 } else {
1117 *out++ = (char) ch;
1118 }
1119 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001120 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1121 *out++ = B64(charsleft << (6-bitsleft));
1122 charsleft = 0;
1123 bitsleft = 0;
1124 /* Characters not in the BASE64 set implicitly unshift the sequence
1125 so no '-' is required, except if the character is itself a '-' */
1126 if (B64CHAR(ch) || ch == '-') {
1127 *out++ = '-';
1128 }
1129 inShift = 0;
1130 *out++ = (char) ch;
1131 } else {
1132 bitsleft += 16;
1133 charsleft = (charsleft << 16) | ch;
1134 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1135
1136 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001137 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001138 or '-' then the shift sequence will be terminated implicitly and we
1139 don't have to insert a '-'. */
1140
1141 if (bitsleft == 0) {
1142 if (i + 1 < size) {
1143 Py_UNICODE ch2 = s[i+1];
1144
1145 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001146
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001147 } else if (B64CHAR(ch2) || ch2 == '-') {
1148 *out++ = '-';
1149 inShift = 0;
1150 } else {
1151 inShift = 0;
1152 }
1153
1154 }
1155 else {
1156 *out++ = '-';
1157 inShift = 0;
1158 }
1159 }
Tim Petersced69f82003-09-16 20:30:58 +00001160 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001161 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001162 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001163 if (bitsleft) {
1164 *out++= B64(charsleft << (6-bitsleft) );
1165 *out++ = '-';
1166 }
1167
Tim Peters5de98422002-04-27 18:44:32 +00001168 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001169 return v;
1170}
1171
1172#undef SPECIAL
1173#undef B64
1174#undef B64CHAR
1175#undef UB64
1176#undef ENCODE
1177#undef DECODE
1178
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179/* --- UTF-8 Codec -------------------------------------------------------- */
1180
/* The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00 - 0x7f: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four-, five- and six-byte prefixes; 0xfe/0xff illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1202
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001204 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 const char *errors)
1206{
Walter Dörwald69652032004-09-07 20:24:22 +00001207 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1208}
1209
1210PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001211 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001212 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001213 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001214{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001215 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001217 Py_ssize_t startinpos;
1218 Py_ssize_t endinpos;
1219 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 const char *e;
1221 PyUnicodeObject *unicode;
1222 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001223 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001224 PyObject *errorHandler = NULL;
1225 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226
1227 /* Note: size will always be longer than the resulting Unicode
1228 character count */
1229 unicode = _PyUnicode_New(size);
1230 if (!unicode)
1231 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001232 if (size == 0) {
1233 if (consumed)
1234 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237
1238 /* Unpack UTF-8 encoded data */
1239 p = unicode->str;
1240 e = s + size;
1241
1242 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001243 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244
1245 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001246 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 s++;
1248 continue;
1249 }
1250
1251 n = utf8_code_length[ch];
1252
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001253 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001254 if (consumed)
1255 break;
1256 else {
1257 errmsg = "unexpected end of data";
1258 startinpos = s-starts;
1259 endinpos = size;
1260 goto utf8Error;
1261 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263
1264 switch (n) {
1265
1266 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001267 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001268 startinpos = s-starts;
1269 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001270 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271
1272 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001273 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 startinpos = s-starts;
1275 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001276 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277
1278 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001279 if ((s[1] & 0xc0) != 0x80) {
1280 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001281 startinpos = s-starts;
1282 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001283 goto utf8Error;
1284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001286 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001287 startinpos = s-starts;
1288 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001289 errmsg = "illegal encoding";
1290 goto utf8Error;
1291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001293 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294 break;
1295
1296 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001297 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001298 (s[2] & 0xc0) != 0x80) {
1299 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001300 startinpos = s-starts;
1301 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001302 goto utf8Error;
1303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001305 if (ch < 0x0800) {
1306 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001307 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001308
1309 XXX For wide builds (UCS-4) we should probably try
1310 to recombine the surrogates into a single code
1311 unit.
1312 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001313 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001314 startinpos = s-starts;
1315 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 goto utf8Error;
1317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001319 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001320 break;
1321
1322 case 4:
1323 if ((s[1] & 0xc0) != 0x80 ||
1324 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001325 (s[3] & 0xc0) != 0x80) {
1326 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001327 startinpos = s-starts;
1328 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001329 goto utf8Error;
1330 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001331 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1332 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1333 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001334 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001335 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001336 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001337 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001338 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001339 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001340 startinpos = s-starts;
1341 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001342 goto utf8Error;
1343 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001344#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001345 *p++ = (Py_UNICODE)ch;
1346#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001347 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001348
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001349 /* translate from 10000..10FFFF to 0..FFFF */
1350 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001351
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001352 /* high surrogate = top 10 bits added to D800 */
1353 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001354
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001355 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001356 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001357#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 break;
1359
1360 default:
1361 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001362 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363 startinpos = s-starts;
1364 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001365 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 }
1367 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001368 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001369
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001370 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371 outpos = p-PyUnicode_AS_UNICODE(unicode);
1372 if (unicode_decode_call_errorhandler(
1373 errors, &errorHandler,
1374 "utf8", errmsg,
1375 starts, size, &startinpos, &endinpos, &exc, &s,
1376 (PyObject **)&unicode, &outpos, &p))
1377 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378 }
Walter Dörwald69652032004-09-07 20:24:22 +00001379 if (consumed)
1380 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381
1382 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001383 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 goto onError;
1385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001386 Py_XDECREF(errorHandler);
1387 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001388 return (PyObject *)unicode;
1389
1390onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001391 Py_XDECREF(errorHandler);
1392 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 Py_DECREF(unicode);
1394 return NULL;
1395}
1396
Tim Peters602f7402002-04-27 18:03:26 +00001397/* Allocation strategy: if the string is short, convert into a stack buffer
1398 and allocate exactly as much space needed at the end. Else allocate the
1399 maximum possible needed (4 result bytes per Unicode character), and return
1400 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001401*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001402PyObject *
1403PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001404 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00001405 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406{
Tim Peters602f7402002-04-27 18:03:26 +00001407#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001408
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001410 PyObject *v; /* result string object */
1411 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001412 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001413 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001414 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001415
Tim Peters602f7402002-04-27 18:03:26 +00001416 assert(s != NULL);
1417 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418
Tim Peters602f7402002-04-27 18:03:26 +00001419 if (size <= MAX_SHORT_UNICHARS) {
1420 /* Write into the stack buffer; nallocated can't overflow.
1421 * At the end, we'll allocate exactly as much heap space as it
1422 * turns out we need.
1423 */
1424 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1425 v = NULL; /* will allocate after we're done */
1426 p = stackbuf;
1427 }
1428 else {
1429 /* Overallocate on the heap, and give the excess back at the end. */
1430 nallocated = size * 4;
1431 if (nallocated / 4 != size) /* overflow! */
1432 return PyErr_NoMemory();
1433 v = PyString_FromStringAndSize(NULL, nallocated);
1434 if (v == NULL)
1435 return NULL;
1436 p = PyString_AS_STRING(v);
1437 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001438
Tim Peters602f7402002-04-27 18:03:26 +00001439 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001440 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001441
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001442 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001443 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001445
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001447 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001448 *p++ = (char)(0xc0 | (ch >> 6));
1449 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001450 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001451 else {
Tim Peters602f7402002-04-27 18:03:26 +00001452 /* Encode UCS2 Unicode ordinals */
1453 if (ch < 0x10000) {
1454 /* Special case: check for high surrogate */
1455 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1456 Py_UCS4 ch2 = s[i];
1457 /* Check for low surrogate and combine the two to
1458 form a UCS4 value */
1459 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001460 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001461 i++;
1462 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001463 }
Tim Peters602f7402002-04-27 18:03:26 +00001464 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001465 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001466 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001467 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1468 *p++ = (char)(0x80 | (ch & 0x3f));
1469 continue;
1470 }
1471encodeUCS4:
1472 /* Encode UCS4 Unicode ordinals */
1473 *p++ = (char)(0xf0 | (ch >> 18));
1474 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1475 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1476 *p++ = (char)(0x80 | (ch & 0x3f));
1477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001479
Tim Peters602f7402002-04-27 18:03:26 +00001480 if (v == NULL) {
1481 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001482 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00001483 assert(nneeded <= nallocated);
1484 v = PyString_FromStringAndSize(stackbuf, nneeded);
1485 }
1486 else {
1487 /* Cut back to size actually needed. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001488 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00001489 assert(nneeded <= nallocated);
1490 _PyString_Resize(&v, nneeded);
1491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001492 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001493
Tim Peters602f7402002-04-27 18:03:26 +00001494#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495}
1496
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1498{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499 if (!PyUnicode_Check(unicode)) {
1500 PyErr_BadArgument();
1501 return NULL;
1502 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001503 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1504 PyUnicode_GET_SIZE(unicode),
1505 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
Walter Dörwald6e390802007-08-17 16:41:28 +00001508/* --- UTF-32 Codec ------------------------------------------------------- */
1509
1510PyObject *
1511PyUnicode_DecodeUTF32(const char *s,
1512 Py_ssize_t size,
1513 const char *errors,
1514 int *byteorder)
1515{
1516 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1517}
1518
1519PyObject *
1520PyUnicode_DecodeUTF32Stateful(const char *s,
1521 Py_ssize_t size,
1522 const char *errors,
1523 int *byteorder,
1524 Py_ssize_t *consumed)
1525{
1526 const char *starts = s;
1527 Py_ssize_t startinpos;
1528 Py_ssize_t endinpos;
1529 Py_ssize_t outpos;
1530 PyUnicodeObject *unicode;
1531 Py_UNICODE *p;
1532#ifndef Py_UNICODE_WIDE
1533 int i, pairs;
1534#else
1535 const int pairs = 0;
1536#endif
1537 const unsigned char *q, *e;
1538 int bo = 0; /* assume native ordering by default */
1539 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00001540 /* Offsets from q for retrieving bytes in the right order. */
1541#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1542 int iorder[] = {0, 1, 2, 3};
1543#else
1544 int iorder[] = {3, 2, 1, 0};
1545#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00001546 PyObject *errorHandler = NULL;
1547 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00001548 /* On narrow builds we split characters outside the BMP into two
1549 codepoints => count how much extra space we need. */
1550#ifndef Py_UNICODE_WIDE
1551 for (i = pairs = 0; i < size/4; i++)
1552 if (((Py_UCS4 *)s)[i] >= 0x10000)
1553 pairs++;
1554#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00001555
1556 /* This might be one to much, because of a BOM */
1557 unicode = _PyUnicode_New((size+3)/4+pairs);
1558 if (!unicode)
1559 return NULL;
1560 if (size == 0)
1561 return (PyObject *)unicode;
1562
1563 /* Unpack UTF-32 encoded data */
1564 p = unicode->str;
1565 q = (unsigned char *)s;
1566 e = q + size;
1567
1568 if (byteorder)
1569 bo = *byteorder;
1570
1571 /* Check for BOM marks (U+FEFF) in the input and adjust current
1572 byte order setting accordingly. In native mode, the leading BOM
1573 mark is skipped, in all other modes, it is copied to the output
1574 stream as-is (giving a ZWNBSP character). */
1575 if (bo == 0) {
1576 if (size >= 4) {
1577 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1578 (q[iorder[1]] << 8) | q[iorder[0]];
1579#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1580 if (bom == 0x0000FEFF) {
1581 q += 4;
1582 bo = -1;
1583 }
1584 else if (bom == 0xFFFE0000) {
1585 q += 4;
1586 bo = 1;
1587 }
1588#else
1589 if (bom == 0x0000FEFF) {
1590 q += 4;
1591 bo = 1;
1592 }
1593 else if (bom == 0xFFFE0000) {
1594 q += 4;
1595 bo = -1;
1596 }
1597#endif
1598 }
1599 }
1600
1601 if (bo == -1) {
1602 /* force LE */
1603 iorder[0] = 0;
1604 iorder[1] = 1;
1605 iorder[2] = 2;
1606 iorder[3] = 3;
1607 }
1608 else if (bo == 1) {
1609 /* force BE */
1610 iorder[0] = 3;
1611 iorder[1] = 2;
1612 iorder[2] = 1;
1613 iorder[3] = 0;
1614 }
1615
1616 while (q < e) {
1617 Py_UCS4 ch;
1618 /* remaining bytes at the end? (size should be divisible by 4) */
1619 if (e-q<4) {
1620 if (consumed)
1621 break;
1622 errmsg = "truncated data";
1623 startinpos = ((const char *)q)-starts;
1624 endinpos = ((const char *)e)-starts;
1625 goto utf32Error;
1626 /* The remaining input chars are ignored if the callback
1627 chooses to skip the input */
1628 }
1629 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1630 (q[iorder[1]] << 8) | q[iorder[0]];
1631
1632 if (ch >= 0x110000)
1633 {
1634 errmsg = "codepoint not in range(0x110000)";
1635 startinpos = ((const char *)q)-starts;
1636 endinpos = startinpos+4;
1637 goto utf32Error;
1638 }
1639#ifndef Py_UNICODE_WIDE
1640 if (ch >= 0x10000)
1641 {
1642 *p++ = 0xD800 | ((ch-0x10000) >> 10);
1643 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
1644 }
1645 else
1646#endif
1647 *p++ = ch;
1648 q += 4;
1649 continue;
1650 utf32Error:
1651 outpos = p-PyUnicode_AS_UNICODE(unicode);
1652 if (unicode_decode_call_errorhandler(
1653 errors, &errorHandler,
1654 "utf32", errmsg,
1655 starts, size, &startinpos, &endinpos, &exc, &s,
1656 (PyObject **)&unicode, &outpos, &p))
1657 goto onError;
1658 }
1659
1660 if (byteorder)
1661 *byteorder = bo;
1662
1663 if (consumed)
1664 *consumed = (const char *)q-starts;
1665
1666 /* Adjust length */
1667 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1668 goto onError;
1669
1670 Py_XDECREF(errorHandler);
1671 Py_XDECREF(exc);
1672 return (PyObject *)unicode;
1673
1674onError:
1675 Py_DECREF(unicode);
1676 Py_XDECREF(errorHandler);
1677 Py_XDECREF(exc);
1678 return NULL;
1679}
1680
1681PyObject *
1682PyUnicode_EncodeUTF32(const Py_UNICODE *s,
1683 Py_ssize_t size,
1684 const char *errors,
1685 int byteorder)
1686{
1687 PyObject *v;
1688 unsigned char *p;
1689#ifndef Py_UNICODE_WIDE
1690 int i, pairs;
1691#else
1692 const int pairs = 0;
1693#endif
1694 /* Offsets from p for storing byte pairs in the right order. */
1695#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1696 int iorder[] = {0, 1, 2, 3};
1697#else
1698 int iorder[] = {3, 2, 1, 0};
1699#endif
1700
1701#define STORECHAR(CH) \
1702 do { \
1703 p[iorder[3]] = ((CH) >> 24) & 0xff; \
1704 p[iorder[2]] = ((CH) >> 16) & 0xff; \
1705 p[iorder[1]] = ((CH) >> 8) & 0xff; \
1706 p[iorder[0]] = (CH) & 0xff; \
1707 p += 4; \
1708 } while(0)
1709
1710 /* In narrow builds we can output surrogate pairs as one codepoint,
1711 so we need less space. */
1712#ifndef Py_UNICODE_WIDE
1713 for (i = pairs = 0; i < size-1; i++)
1714 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
1715 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
1716 pairs++;
1717#endif
1718 v = PyString_FromStringAndSize(NULL,
1719 4 * (size - pairs + (byteorder == 0)));
1720 if (v == NULL)
1721 return NULL;
1722
1723 p = (unsigned char *)PyString_AS_STRING(v);
1724 if (byteorder == 0)
1725 STORECHAR(0xFEFF);
1726 if (size == 0)
1727 return v;
1728
1729 if (byteorder == -1) {
1730 /* force LE */
1731 iorder[0] = 0;
1732 iorder[1] = 1;
1733 iorder[2] = 2;
1734 iorder[3] = 3;
1735 }
1736 else if (byteorder == 1) {
1737 /* force BE */
1738 iorder[0] = 3;
1739 iorder[1] = 2;
1740 iorder[2] = 1;
1741 iorder[3] = 0;
1742 }
1743
1744 while (size-- > 0) {
1745 Py_UCS4 ch = *s++;
1746#ifndef Py_UNICODE_WIDE
1747 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
1748 Py_UCS4 ch2 = *s;
1749 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1750 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1751 s++;
1752 size--;
1753 }
1754 }
1755#endif
1756 STORECHAR(ch);
1757 }
1758 return v;
1759#undef STORECHAR
1760}
1761
1762PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
1763{
1764 if (!PyUnicode_Check(unicode)) {
1765 PyErr_BadArgument();
1766 return NULL;
1767 }
1768 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
1769 PyUnicode_GET_SIZE(unicode),
1770 NULL,
1771 0);
1772}
1773
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774/* --- UTF-16 Codec ------------------------------------------------------- */
1775
Tim Peters772747b2001-08-09 22:21:55 +00001776PyObject *
1777PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001778 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001779 const char *errors,
1780 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781{
Walter Dörwald69652032004-09-07 20:24:22 +00001782 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1783}
1784
1785PyObject *
1786PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001787 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00001788 const char *errors,
1789 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001790 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001792 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001793 Py_ssize_t startinpos;
1794 Py_ssize_t endinpos;
1795 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 PyUnicodeObject *unicode;
1797 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001798 const unsigned char *q, *e;
1799 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001800 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001801 /* Offsets from q for retrieving byte pairs in the right order. */
1802#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1803 int ihi = 1, ilo = 0;
1804#else
1805 int ihi = 0, ilo = 1;
1806#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001807 PyObject *errorHandler = NULL;
1808 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809
1810 /* Note: size will always be longer than the resulting Unicode
1811 character count */
1812 unicode = _PyUnicode_New(size);
1813 if (!unicode)
1814 return NULL;
1815 if (size == 0)
1816 return (PyObject *)unicode;
1817
1818 /* Unpack UTF-16 encoded data */
1819 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001820 q = (unsigned char *)s;
1821 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822
1823 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001824 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001826 /* Check for BOM marks (U+FEFF) in the input and adjust current
1827 byte order setting accordingly. In native mode, the leading BOM
1828 mark is skipped, in all other modes, it is copied to the output
1829 stream as-is (giving a ZWNBSP character). */
1830 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001831 if (size >= 2) {
1832 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001833#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001834 if (bom == 0xFEFF) {
1835 q += 2;
1836 bo = -1;
1837 }
1838 else if (bom == 0xFFFE) {
1839 q += 2;
1840 bo = 1;
1841 }
Tim Petersced69f82003-09-16 20:30:58 +00001842#else
Walter Dörwald69652032004-09-07 20:24:22 +00001843 if (bom == 0xFEFF) {
1844 q += 2;
1845 bo = 1;
1846 }
1847 else if (bom == 0xFFFE) {
1848 q += 2;
1849 bo = -1;
1850 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001851#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001852 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Tim Peters772747b2001-08-09 22:21:55 +00001855 if (bo == -1) {
1856 /* force LE */
1857 ihi = 1;
1858 ilo = 0;
1859 }
1860 else if (bo == 1) {
1861 /* force BE */
1862 ihi = 0;
1863 ilo = 1;
1864 }
1865
1866 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001867 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001868 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001870 if (consumed)
1871 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 errmsg = "truncated data";
1873 startinpos = ((const char *)q)-starts;
1874 endinpos = ((const char *)e)-starts;
1875 goto utf16Error;
1876 /* The remaining input chars are ignored if the callback
1877 chooses to skip the input */
1878 }
1879 ch = (q[ihi] << 8) | q[ilo];
1880
Tim Peters772747b2001-08-09 22:21:55 +00001881 q += 2;
1882
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 if (ch < 0xD800 || ch > 0xDFFF) {
1884 *p++ = ch;
1885 continue;
1886 }
1887
1888 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001889 if (q >= e) {
1890 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 startinpos = (((const char *)q)-2)-starts;
1892 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001893 goto utf16Error;
1894 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001895 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001896 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1897 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001898 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001899#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001900 *p++ = ch;
1901 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001902#else
1903 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001904#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001905 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001906 }
1907 else {
1908 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001909 startinpos = (((const char *)q)-4)-starts;
1910 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001911 goto utf16Error;
1912 }
1913
Guido van Rossumd57fd912000-03-10 22:53:23 +00001914 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001915 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001916 startinpos = (((const char *)q)-2)-starts;
1917 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001918 /* Fall through to report the error */
1919
1920 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001921 outpos = p-PyUnicode_AS_UNICODE(unicode);
1922 if (unicode_decode_call_errorhandler(
1923 errors, &errorHandler,
1924 "utf16", errmsg,
1925 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1926 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001927 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928 }
1929
1930 if (byteorder)
1931 *byteorder = bo;
1932
Walter Dörwald69652032004-09-07 20:24:22 +00001933 if (consumed)
1934 *consumed = (const char *)q-starts;
1935
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001937 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 goto onError;
1939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 Py_XDECREF(errorHandler);
1941 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 return (PyObject *)unicode;
1943
1944onError:
1945 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 Py_XDECREF(errorHandler);
1947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948 return NULL;
1949}
1950
Tim Peters772747b2001-08-09 22:21:55 +00001951PyObject *
1952PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001953 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00001954 const char *errors,
1955 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956{
1957 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001958 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001959#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001960 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001961#else
1962 const int pairs = 0;
1963#endif
Tim Peters772747b2001-08-09 22:21:55 +00001964 /* Offsets from p for storing byte pairs in the right order. */
1965#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1966 int ihi = 1, ilo = 0;
1967#else
1968 int ihi = 0, ilo = 1;
1969#endif
1970
1971#define STORECHAR(CH) \
1972 do { \
1973 p[ihi] = ((CH) >> 8) & 0xff; \
1974 p[ilo] = (CH) & 0xff; \
1975 p += 2; \
1976 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001978#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001979 for (i = pairs = 0; i < size; i++)
1980 if (s[i] >= 0x10000)
1981 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001982#endif
Tim Petersced69f82003-09-16 20:30:58 +00001983 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001984 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 if (v == NULL)
1986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987
Tim Peters772747b2001-08-09 22:21:55 +00001988 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001990 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001991 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001992 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001993
1994 if (byteorder == -1) {
1995 /* force LE */
1996 ihi = 1;
1997 ilo = 0;
1998 }
1999 else if (byteorder == 1) {
2000 /* force BE */
2001 ihi = 0;
2002 ilo = 1;
2003 }
2004
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002005 while (size-- > 0) {
2006 Py_UNICODE ch = *s++;
2007 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002008#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002009 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002010 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2011 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002013#endif
Tim Peters772747b2001-08-09 22:21:55 +00002014 STORECHAR(ch);
2015 if (ch2)
2016 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002019#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020}
2021
2022PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2023{
2024 if (!PyUnicode_Check(unicode)) {
2025 PyErr_BadArgument();
2026 return NULL;
2027 }
2028 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2029 PyUnicode_GET_SIZE(unicode),
2030 NULL,
2031 0);
2032}
2033
2034/* --- Unicode Escape Codec ----------------------------------------------- */
2035
Fredrik Lundh06d12682001-01-24 07:59:11 +00002036static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002037
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002039 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 const char *errors)
2041{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002043 Py_ssize_t startinpos;
2044 Py_ssize_t endinpos;
2045 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002046 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002050 char* message;
2051 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002052 PyObject *errorHandler = NULL;
2053 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002054
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 /* Escaped strings will always be longer than the resulting
2056 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 length after conversion to the true value.
2058 (but if the error callback returns a long replacement string
2059 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 v = _PyUnicode_New(size);
2061 if (v == NULL)
2062 goto onError;
2063 if (size == 0)
2064 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002068
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 while (s < end) {
2070 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002071 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073
2074 /* Non-escape characters are interpreted as Unicode ordinals */
2075 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002076 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 continue;
2078 }
2079
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002080 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 /* \ - Escapes */
2082 s++;
2083 switch (*s++) {
2084
2085 /* \x escapes */
2086 case '\n': break;
2087 case '\\': *p++ = '\\'; break;
2088 case '\'': *p++ = '\''; break;
2089 case '\"': *p++ = '\"'; break;
2090 case 'b': *p++ = '\b'; break;
2091 case 'f': *p++ = '\014'; break; /* FF */
2092 case 't': *p++ = '\t'; break;
2093 case 'n': *p++ = '\n'; break;
2094 case 'r': *p++ = '\r'; break;
2095 case 'v': *p++ = '\013'; break; /* VT */
2096 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2097
2098 /* \OOO (octal) escapes */
2099 case '0': case '1': case '2': case '3':
2100 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002101 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002103 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002105 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002106 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002107 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108 break;
2109
Fredrik Lundhccc74732001-02-18 22:13:49 +00002110 /* hex escapes */
2111 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002113 digits = 2;
2114 message = "truncated \\xXX escape";
2115 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116
Fredrik Lundhccc74732001-02-18 22:13:49 +00002117 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002119 digits = 4;
2120 message = "truncated \\uXXXX escape";
2121 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122
Fredrik Lundhccc74732001-02-18 22:13:49 +00002123 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002124 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002125 digits = 8;
2126 message = "truncated \\UXXXXXXXX escape";
2127 hexescape:
2128 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002129 outpos = p-PyUnicode_AS_UNICODE(v);
2130 if (s+digits>end) {
2131 endinpos = size;
2132 if (unicode_decode_call_errorhandler(
2133 errors, &errorHandler,
2134 "unicodeescape", "end of string in escape sequence",
2135 starts, size, &startinpos, &endinpos, &exc, &s,
2136 (PyObject **)&v, &outpos, &p))
2137 goto onError;
2138 goto nextByte;
2139 }
2140 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002141 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002142 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002143 endinpos = (s+i+1)-starts;
2144 if (unicode_decode_call_errorhandler(
2145 errors, &errorHandler,
2146 "unicodeescape", message,
2147 starts, size, &startinpos, &endinpos, &exc, &s,
2148 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002149 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002151 }
2152 chr = (chr<<4) & ~0xF;
2153 if (c >= '0' && c <= '9')
2154 chr += c - '0';
2155 else if (c >= 'a' && c <= 'f')
2156 chr += 10 + c - 'a';
2157 else
2158 chr += 10 + c - 'A';
2159 }
2160 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002161 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002162 /* _decoding_error will have already written into the
2163 target buffer. */
2164 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002165 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002166 /* when we get here, chr is a 32-bit unicode character */
2167 if (chr <= 0xffff)
2168 /* UCS-2 character */
2169 *p++ = (Py_UNICODE) chr;
2170 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002171 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002172 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002173#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002174 *p++ = chr;
2175#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002176 chr -= 0x10000L;
2177 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002178 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002179#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002180 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 endinpos = s-starts;
2182 outpos = p-PyUnicode_AS_UNICODE(v);
2183 if (unicode_decode_call_errorhandler(
2184 errors, &errorHandler,
2185 "unicodeescape", "illegal Unicode character",
2186 starts, size, &startinpos, &endinpos, &exc, &s,
2187 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002188 goto onError;
2189 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002190 break;
2191
2192 /* \N{name} */
2193 case 'N':
2194 message = "malformed \\N character escape";
2195 if (ucnhash_CAPI == NULL) {
2196 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002197 PyObject *m, *api;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002198 m = PyImport_ImportModule("unicodedata");
2199 if (m == NULL)
2200 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002201 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002202 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002203 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002204 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002205 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002206 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002207 if (ucnhash_CAPI == NULL)
2208 goto ucnhashError;
2209 }
2210 if (*s == '{') {
2211 const char *start = s+1;
2212 /* look for the closing brace */
2213 while (*s != '}' && s < end)
2214 s++;
2215 if (s > start && s < end && *s == '}') {
2216 /* found a name. look it up in the unicode database */
2217 message = "unknown Unicode character name";
2218 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002219 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002220 goto store;
2221 }
2222 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002223 endinpos = s-starts;
2224 outpos = p-PyUnicode_AS_UNICODE(v);
2225 if (unicode_decode_call_errorhandler(
2226 errors, &errorHandler,
2227 "unicodeescape", message,
2228 starts, size, &startinpos, &endinpos, &exc, &s,
2229 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002230 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002231 break;
2232
2233 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002234 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 message = "\\ at end of string";
2236 s--;
2237 endinpos = s-starts;
2238 outpos = p-PyUnicode_AS_UNICODE(v);
2239 if (unicode_decode_call_errorhandler(
2240 errors, &errorHandler,
2241 "unicodeescape", message,
2242 starts, size, &startinpos, &endinpos, &exc, &s,
2243 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002244 goto onError;
2245 }
2246 else {
2247 *p++ = '\\';
2248 *p++ = (unsigned char)s[-1];
2249 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002250 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002252 nextByte:
2253 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002255 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002256 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002257 Py_XDECREF(errorHandler);
2258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002260
Fredrik Lundhccc74732001-02-18 22:13:49 +00002261ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002262 PyErr_SetString(
2263 PyExc_UnicodeError,
2264 "\\N escapes not supported (can't load unicodedata module)"
2265 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002266 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002267 Py_XDECREF(errorHandler);
2268 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002269 return NULL;
2270
Fredrik Lundhccc74732001-02-18 22:13:49 +00002271onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002273 Py_XDECREF(errorHandler);
2274 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275 return NULL;
2276}
2277
2278/* Return a Unicode-Escape string version of the Unicode object.
2279
2280 If quotes is true, the string is enclosed in u"" or u'' quotes as
2281 appropriate.
2282
2283*/
2284
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002285Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Fredrik Lundh95e2a912006-05-26 11:38:15 +00002286 Py_ssize_t size,
2287 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002288{
2289 /* like wcschr, but doesn't stop at NULL characters */
2290
2291 while (size-- > 0) {
2292 if (*s == ch)
2293 return s;
2294 s++;
2295 }
2296
2297 return NULL;
2298}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002299
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300static
2301PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002302 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 int quotes)
2304{
2305 PyObject *repr;
2306 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002308 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309
Neal Norwitz17753ec2006-08-21 22:21:19 +00002310 /* XXX(nnorwitz): rather than over-allocating, it would be
2311 better to choose a different scheme. Perhaps scan the
2312 first N-chars of the string and allocate based on that size.
2313 */
2314 /* Initial allocation is based on the longest-possible unichr
2315 escape.
2316
2317 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2318 unichr, so in this case it's the longest unichr escape. In
2319 narrow (UTF-16) builds this is five chars per source unichr
2320 since there are two unichrs in the surrogate pair, so in narrow
2321 (UTF-16) builds it's not the longest unichr escape.
2322
2323 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2324 so in the narrow (UTF-16) build case it's the longest unichr
2325 escape.
2326 */
2327
2328 repr = PyString_FromStringAndSize(NULL,
2329 2
2330#ifdef Py_UNICODE_WIDE
2331 + 10*size
2332#else
2333 + 6*size
2334#endif
2335 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002336 if (repr == NULL)
2337 return NULL;
2338
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002339 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340
2341 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002343 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344 !findchar(s, size, '"')) ? '"' : '\'';
2345 }
2346 while (size-- > 0) {
2347 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002348
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002349 /* Escape quotes and backslashes */
2350 if ((quotes &&
2351 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352 *p++ = '\\';
2353 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00002354 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002355 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002356
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002357#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002358 /* Map 21-bit characters to '\U00xxxxxx' */
2359 else if (ch >= 0x10000) {
2360 *p++ = '\\';
2361 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002362 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2363 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2364 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2365 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2366 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2367 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2368 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002369 *p++ = hexdigit[ch & 0x0000000F];
2370 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002371 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002372#else
2373 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002374 else if (ch >= 0xD800 && ch < 0xDC00) {
2375 Py_UNICODE ch2;
2376 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002377
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002378 ch2 = *s++;
2379 size--;
2380 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2381 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2382 *p++ = '\\';
2383 *p++ = 'U';
2384 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2385 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2386 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2387 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2388 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2389 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2390 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2391 *p++ = hexdigit[ucs & 0x0000000F];
2392 continue;
2393 }
2394 /* Fall through: isolated surrogates are copied as-is */
2395 s--;
2396 size++;
2397 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002398#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002399
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002401 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 *p++ = '\\';
2403 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002404 *p++ = hexdigit[(ch >> 12) & 0x000F];
2405 *p++ = hexdigit[(ch >> 8) & 0x000F];
2406 *p++ = hexdigit[(ch >> 4) & 0x000F];
2407 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002409
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002410 /* Map special whitespace to '\t', \n', '\r' */
2411 else if (ch == '\t') {
2412 *p++ = '\\';
2413 *p++ = 't';
2414 }
2415 else if (ch == '\n') {
2416 *p++ = '\\';
2417 *p++ = 'n';
2418 }
2419 else if (ch == '\r') {
2420 *p++ = '\\';
2421 *p++ = 'r';
2422 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002423
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002424 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002425 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002427 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002428 *p++ = hexdigit[(ch >> 4) & 0x000F];
2429 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002430 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002431
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432 /* Copy everything else as-is */
2433 else
2434 *p++ = (char) ch;
2435 }
2436 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002437 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438
2439 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002440 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 return repr;
2442}
2443
2444PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002445 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002446{
2447 return unicodeescape_string(s, size, 0);
2448}
2449
2450PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2451{
2452 if (!PyUnicode_Check(unicode)) {
2453 PyErr_BadArgument();
2454 return NULL;
2455 }
2456 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2457 PyUnicode_GET_SIZE(unicode));
2458}
2459
2460/* --- Raw Unicode Escape Codec ------------------------------------------- */
2461
2462PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002463 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 const char *errors)
2465{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002466 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002467 Py_ssize_t startinpos;
2468 Py_ssize_t endinpos;
2469 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002471 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 const char *end;
2473 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002474 PyObject *errorHandler = NULL;
2475 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002476
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477 /* Escaped strings will always be longer than the resulting
2478 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 length after conversion to the true value. (But decoding error
2480 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 v = _PyUnicode_New(size);
2482 if (v == NULL)
2483 goto onError;
2484 if (size == 0)
2485 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487 end = s + size;
2488 while (s < end) {
2489 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002490 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002492 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493
2494 /* Non-escape characters are interpreted as Unicode ordinals */
2495 if (*s != '\\') {
2496 *p++ = (unsigned char)*s++;
2497 continue;
2498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500
2501 /* \u-escapes are only interpreted iff the number of leading
2502 backslashes if odd */
2503 bs = s;
2504 for (;s < end;) {
2505 if (*s != '\\')
2506 break;
2507 *p++ = (unsigned char)*s++;
2508 }
2509 if (((s - bs) & 1) == 0 ||
2510 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002511 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 continue;
2513 }
2514 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002515 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 s++;
2517
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002518 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002519 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002520 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002523 endinpos = s-starts;
2524 if (unicode_decode_call_errorhandler(
2525 errors, &errorHandler,
2526 "rawunicodeescape", "truncated \\uXXXX",
2527 starts, size, &startinpos, &endinpos, &exc, &s,
2528 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002530 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 }
2532 x = (x<<4) & ~0xF;
2533 if (c >= '0' && c <= '9')
2534 x += c - '0';
2535 else if (c >= 'a' && c <= 'f')
2536 x += 10 + c - 'a';
2537 else
2538 x += 10 + c - 'A';
2539 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002540#ifndef Py_UNICODE_WIDE
2541 if (x > 0x10000) {
2542 if (unicode_decode_call_errorhandler(
2543 errors, &errorHandler,
2544 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2545 starts, size, &startinpos, &endinpos, &exc, &s,
2546 (PyObject **)&v, &outpos, &p))
2547 goto onError;
2548 }
2549#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002550 *p++ = x;
2551 nextByte:
2552 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002554 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002555 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002556 Py_XDECREF(errorHandler);
2557 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002559
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 onError:
2561 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002562 Py_XDECREF(errorHandler);
2563 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 return NULL;
2565}
2566
2567PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002568 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569{
2570 PyObject *repr;
2571 char *p;
2572 char *q;
2573
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002574 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002576#ifdef Py_UNICODE_WIDE
2577 repr = PyString_FromStringAndSize(NULL, 10 * size);
2578#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002580#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 if (repr == NULL)
2582 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002583 if (size == 0)
2584 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585
2586 p = q = PyString_AS_STRING(repr);
2587 while (size-- > 0) {
2588 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002589#ifdef Py_UNICODE_WIDE
2590 /* Map 32-bit characters to '\Uxxxxxxxx' */
2591 if (ch >= 0x10000) {
2592 *p++ = '\\';
2593 *p++ = 'U';
2594 *p++ = hexdigit[(ch >> 28) & 0xf];
2595 *p++ = hexdigit[(ch >> 24) & 0xf];
2596 *p++ = hexdigit[(ch >> 20) & 0xf];
2597 *p++ = hexdigit[(ch >> 16) & 0xf];
2598 *p++ = hexdigit[(ch >> 12) & 0xf];
2599 *p++ = hexdigit[(ch >> 8) & 0xf];
2600 *p++ = hexdigit[(ch >> 4) & 0xf];
2601 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002602 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002603 else
2604#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 /* Map 16-bit characters to '\uxxxx' */
2606 if (ch >= 256) {
2607 *p++ = '\\';
2608 *p++ = 'u';
2609 *p++ = hexdigit[(ch >> 12) & 0xf];
2610 *p++ = hexdigit[(ch >> 8) & 0xf];
2611 *p++ = hexdigit[(ch >> 4) & 0xf];
2612 *p++ = hexdigit[ch & 15];
2613 }
2614 /* Copy everything else as-is */
2615 else
2616 *p++ = (char) ch;
2617 }
2618 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002619 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 return repr;
2621}
2622
2623PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2624{
2625 if (!PyUnicode_Check(unicode)) {
2626 PyErr_BadArgument();
2627 return NULL;
2628 }
2629 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2630 PyUnicode_GET_SIZE(unicode));
2631}
2632
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002633/* --- Unicode Internal Codec ------------------------------------------- */
2634
2635PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002636 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002637 const char *errors)
2638{
2639 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002640 Py_ssize_t startinpos;
2641 Py_ssize_t endinpos;
2642 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002643 PyUnicodeObject *v;
2644 Py_UNICODE *p;
2645 const char *end;
2646 const char *reason;
2647 PyObject *errorHandler = NULL;
2648 PyObject *exc = NULL;
2649
Neal Norwitzd43069c2006-01-08 01:12:10 +00002650#ifdef Py_UNICODE_WIDE
2651 Py_UNICODE unimax = PyUnicode_GetMax();
2652#endif
2653
Armin Rigo7ccbca92006-10-04 12:17:45 +00002654 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002655 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2656 if (v == NULL)
2657 goto onError;
2658 if (PyUnicode_GetSize((PyObject *)v) == 0)
2659 return (PyObject *)v;
2660 p = PyUnicode_AS_UNICODE(v);
2661 end = s + size;
2662
2663 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00002664 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002665 /* We have to sanity check the raw data, otherwise doom looms for
2666 some malformed UCS-4 data. */
2667 if (
2668 #ifdef Py_UNICODE_WIDE
2669 *p > unimax || *p < 0 ||
2670 #endif
2671 end-s < Py_UNICODE_SIZE
2672 )
2673 {
2674 startinpos = s - starts;
2675 if (end-s < Py_UNICODE_SIZE) {
2676 endinpos = end-starts;
2677 reason = "truncated input";
2678 }
2679 else {
2680 endinpos = s - starts + Py_UNICODE_SIZE;
2681 reason = "illegal code point (> 0x10FFFF)";
2682 }
2683 outpos = p - PyUnicode_AS_UNICODE(v);
2684 if (unicode_decode_call_errorhandler(
2685 errors, &errorHandler,
2686 "unicode_internal", reason,
2687 starts, size, &startinpos, &endinpos, &exc, &s,
2688 (PyObject **)&v, &outpos, &p)) {
2689 goto onError;
2690 }
2691 }
2692 else {
2693 p++;
2694 s += Py_UNICODE_SIZE;
2695 }
2696 }
2697
Martin v. Löwis412fb672006-04-13 06:34:32 +00002698 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002699 goto onError;
2700 Py_XDECREF(errorHandler);
2701 Py_XDECREF(exc);
2702 return (PyObject *)v;
2703
2704 onError:
2705 Py_XDECREF(v);
2706 Py_XDECREF(errorHandler);
2707 Py_XDECREF(exc);
2708 return NULL;
2709}
2710
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711/* --- Latin-1 Codec ------------------------------------------------------ */
2712
2713PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002714 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002715 const char *errors)
2716{
2717 PyUnicodeObject *v;
2718 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002719
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002721 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002722 Py_UNICODE r = *(unsigned char*)s;
2723 return PyUnicode_FromUnicode(&r, 1);
2724 }
2725
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 v = _PyUnicode_New(size);
2727 if (v == NULL)
2728 goto onError;
2729 if (size == 0)
2730 return (PyObject *)v;
2731 p = PyUnicode_AS_UNICODE(v);
2732 while (size-- > 0)
2733 *p++ = (unsigned char)*s++;
2734 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002735
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 onError:
2737 Py_XDECREF(v);
2738 return NULL;
2739}
2740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741/* create or adjust a UnicodeEncodeError */
2742static void make_encode_exception(PyObject **exceptionObject,
2743 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002744 const Py_UNICODE *unicode, Py_ssize_t size,
2745 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 if (*exceptionObject == NULL) {
2749 *exceptionObject = PyUnicodeEncodeError_Create(
2750 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 }
2752 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2754 goto onError;
2755 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2756 goto onError;
2757 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2758 goto onError;
2759 return;
2760 onError:
2761 Py_DECREF(*exceptionObject);
2762 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 }
2764}
2765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766/* raises a UnicodeEncodeError */
2767static void raise_encode_exception(PyObject **exceptionObject,
2768 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002769 const Py_UNICODE *unicode, Py_ssize_t size,
2770 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002771 const char *reason)
2772{
2773 make_encode_exception(exceptionObject,
2774 encoding, unicode, size, startpos, endpos, reason);
2775 if (*exceptionObject != NULL)
2776 PyCodec_StrictErrors(*exceptionObject);
2777}
2778
2779/* error handling callback helper:
2780 build arguments, call the callback and check the arguments,
2781 put the result into newpos and return the replacement string, which
2782 has to be freed by the caller */
2783static PyObject *unicode_encode_call_errorhandler(const char *errors,
2784 PyObject **errorHandler,
2785 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002786 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2787 Py_ssize_t startpos, Py_ssize_t endpos,
2788 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002789{
Martin v. Löwis18e16552006-02-15 17:27:45 +00002790 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791
2792 PyObject *restuple;
2793 PyObject *resunicode;
2794
2795 if (*errorHandler == NULL) {
2796 *errorHandler = PyCodec_LookupError(errors);
2797 if (*errorHandler == NULL)
2798 return NULL;
2799 }
2800
2801 make_encode_exception(exceptionObject,
2802 encoding, unicode, size, startpos, endpos, reason);
2803 if (*exceptionObject == NULL)
2804 return NULL;
2805
2806 restuple = PyObject_CallFunctionObjArgs(
2807 *errorHandler, *exceptionObject, NULL);
2808 if (restuple == NULL)
2809 return NULL;
2810 if (!PyTuple_Check(restuple)) {
2811 PyErr_Format(PyExc_TypeError, &argparse[4]);
2812 Py_DECREF(restuple);
2813 return NULL;
2814 }
2815 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2816 &resunicode, newpos)) {
2817 Py_DECREF(restuple);
2818 return NULL;
2819 }
2820 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002821 *newpos = size+*newpos;
2822 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00002823 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002824 Py_DECREF(restuple);
2825 return NULL;
2826 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 Py_INCREF(resunicode);
2828 Py_DECREF(restuple);
2829 return resunicode;
2830}
2831
2832static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002833 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 const char *errors,
2835 int limit)
2836{
2837 /* output object */
2838 PyObject *res;
2839 /* pointers to the beginning and end+1 of input */
2840 const Py_UNICODE *startp = p;
2841 const Py_UNICODE *endp = p + size;
2842 /* pointer to the beginning of the unencodable characters */
2843 /* const Py_UNICODE *badp = NULL; */
2844 /* pointer into the output */
2845 char *str;
2846 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002847 Py_ssize_t respos = 0;
2848 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00002849 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2850 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 PyObject *errorHandler = NULL;
2852 PyObject *exc = NULL;
2853 /* the following variable is used for caching string comparisons
2854 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2855 int known_errorHandler = -1;
2856
2857 /* allocate enough for a simple encoding without
2858 replacements, if we need more, we'll resize */
2859 res = PyString_FromStringAndSize(NULL, size);
2860 if (res == NULL)
2861 goto onError;
2862 if (size == 0)
2863 return res;
2864 str = PyString_AS_STRING(res);
2865 ressize = size;
2866
2867 while (p<endp) {
2868 Py_UNICODE c = *p;
2869
2870 /* can we encode this? */
2871 if (c<limit) {
2872 /* no overflow check, because we know that the space is enough */
2873 *str++ = (char)c;
2874 ++p;
2875 }
2876 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00002877 Py_ssize_t unicodepos = p-startp;
2878 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002879 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002880 Py_ssize_t repsize;
2881 Py_ssize_t newpos;
2882 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002883 Py_UNICODE *uni2;
2884 /* startpos for collecting unencodable chars */
2885 const Py_UNICODE *collstart = p;
2886 const Py_UNICODE *collend = p;
2887 /* find all unecodable characters */
2888 while ((collend < endp) && ((*collend)>=limit))
2889 ++collend;
2890 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2891 if (known_errorHandler==-1) {
2892 if ((errors==NULL) || (!strcmp(errors, "strict")))
2893 known_errorHandler = 1;
2894 else if (!strcmp(errors, "replace"))
2895 known_errorHandler = 2;
2896 else if (!strcmp(errors, "ignore"))
2897 known_errorHandler = 3;
2898 else if (!strcmp(errors, "xmlcharrefreplace"))
2899 known_errorHandler = 4;
2900 else
2901 known_errorHandler = 0;
2902 }
2903 switch (known_errorHandler) {
2904 case 1: /* strict */
2905 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2906 goto onError;
2907 case 2: /* replace */
2908 while (collstart++<collend)
2909 *str++ = '?'; /* fall through */
2910 case 3: /* ignore */
2911 p = collend;
2912 break;
2913 case 4: /* xmlcharrefreplace */
2914 respos = str-PyString_AS_STRING(res);
2915 /* determine replacement size (temporarily (mis)uses p) */
2916 for (p = collstart, repsize = 0; p < collend; ++p) {
2917 if (*p<10)
2918 repsize += 2+1+1;
2919 else if (*p<100)
2920 repsize += 2+2+1;
2921 else if (*p<1000)
2922 repsize += 2+3+1;
2923 else if (*p<10000)
2924 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002925#ifndef Py_UNICODE_WIDE
2926 else
2927 repsize += 2+5+1;
2928#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002929 else if (*p<100000)
2930 repsize += 2+5+1;
2931 else if (*p<1000000)
2932 repsize += 2+6+1;
2933 else
2934 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002935#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936 }
2937 requiredsize = respos+repsize+(endp-collend);
2938 if (requiredsize > ressize) {
2939 if (requiredsize<2*ressize)
2940 requiredsize = 2*ressize;
2941 if (_PyString_Resize(&res, requiredsize))
2942 goto onError;
2943 str = PyString_AS_STRING(res) + respos;
2944 ressize = requiredsize;
2945 }
2946 /* generate replacement (temporarily (mis)uses p) */
2947 for (p = collstart; p < collend; ++p) {
2948 str += sprintf(str, "&#%d;", (int)*p);
2949 }
2950 p = collend;
2951 break;
2952 default:
2953 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2954 encoding, reason, startp, size, &exc,
2955 collstart-startp, collend-startp, &newpos);
2956 if (repunicode == NULL)
2957 goto onError;
2958 /* need more space? (at least enough for what we
2959 have+the replacement+the rest of the string, so
2960 we won't have to check space for encodable characters) */
2961 respos = str-PyString_AS_STRING(res);
2962 repsize = PyUnicode_GET_SIZE(repunicode);
2963 requiredsize = respos+repsize+(endp-collend);
2964 if (requiredsize > ressize) {
2965 if (requiredsize<2*ressize)
2966 requiredsize = 2*ressize;
2967 if (_PyString_Resize(&res, requiredsize)) {
2968 Py_DECREF(repunicode);
2969 goto onError;
2970 }
2971 str = PyString_AS_STRING(res) + respos;
2972 ressize = requiredsize;
2973 }
2974 /* check if there is anything unencodable in the replacement
2975 and copy it to the output */
2976 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2977 c = *uni2;
2978 if (c >= limit) {
2979 raise_encode_exception(&exc, encoding, startp, size,
2980 unicodepos, unicodepos+1, reason);
2981 Py_DECREF(repunicode);
2982 goto onError;
2983 }
2984 *str = (char)c;
2985 }
2986 p = startp + newpos;
2987 Py_DECREF(repunicode);
2988 }
2989 }
2990 }
2991 /* Resize if we allocated to much */
2992 respos = str-PyString_AS_STRING(res);
2993 if (respos<ressize)
2994 /* If this falls res will be NULL */
2995 _PyString_Resize(&res, respos);
2996 Py_XDECREF(errorHandler);
2997 Py_XDECREF(exc);
2998 return res;
2999
3000 onError:
3001 Py_XDECREF(res);
3002 Py_XDECREF(errorHandler);
3003 Py_XDECREF(exc);
3004 return NULL;
3005}
3006
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003008 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 const char *errors)
3010{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003011 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003012}
3013
3014PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3015{
3016 if (!PyUnicode_Check(unicode)) {
3017 PyErr_BadArgument();
3018 return NULL;
3019 }
3020 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3021 PyUnicode_GET_SIZE(unicode),
3022 NULL);
3023}
3024
3025/* --- 7-bit ASCII Codec -------------------------------------------------- */
3026
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003028 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 const char *errors)
3030{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 PyUnicodeObject *v;
3033 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003034 Py_ssize_t startinpos;
3035 Py_ssize_t endinpos;
3036 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 const char *e;
3038 PyObject *errorHandler = NULL;
3039 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003040
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003042 if (size == 1 && *(unsigned char*)s < 128) {
3043 Py_UNICODE r = *(unsigned char*)s;
3044 return PyUnicode_FromUnicode(&r, 1);
3045 }
Tim Petersced69f82003-09-16 20:30:58 +00003046
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 v = _PyUnicode_New(size);
3048 if (v == NULL)
3049 goto onError;
3050 if (size == 0)
3051 return (PyObject *)v;
3052 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 e = s + size;
3054 while (s < e) {
3055 register unsigned char c = (unsigned char)*s;
3056 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003058 ++s;
3059 }
3060 else {
3061 startinpos = s-starts;
3062 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00003063 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 if (unicode_decode_call_errorhandler(
3065 errors, &errorHandler,
3066 "ascii", "ordinal not in range(128)",
3067 starts, size, &startinpos, &endinpos, &exc, &s,
3068 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003069 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003072 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003073 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003074 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003075 Py_XDECREF(errorHandler);
3076 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003078
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079 onError:
3080 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003081 Py_XDECREF(errorHandler);
3082 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 return NULL;
3084}
3085
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003087 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 const char *errors)
3089{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003090 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091}
3092
3093PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3094{
3095 if (!PyUnicode_Check(unicode)) {
3096 PyErr_BadArgument();
3097 return NULL;
3098 }
3099 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3100 PyUnicode_GET_SIZE(unicode),
3101 NULL);
3102}
3103
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003104#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003105
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003106/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003107
Martin v. Löwisd8251432006-06-14 05:21:04 +00003108#if SIZEOF_INT < SIZEOF_SSIZE_T
3109#define NEED_RETRY
3110#endif
3111
3112/* XXX This code is limited to "true" double-byte encodings, as
3113 a) it assumes an incomplete character consists of a single byte, and
3114 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3115 encodings, see IsDBCSLeadByteEx documentation. */
3116
/* Check the byte at s[offset].

   Returns 0 if IsDBCSLeadByte() rejects it.  Otherwise returns 1 when
   CharPrev() could not step back from it, when the previous character does
   not begin with a lead byte, or when the previous character is two bytes
   wide; and 0 in the remaining case. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3127
3128/*
3129 * Decode MBCS string into unicode object. If 'final' is set, converts
3130 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3131 */
3132static int decode_mbcs(PyUnicodeObject **v,
3133 const char *s, /* MBCS string */
3134 int size, /* sizeof MBCS string */
3135 int final)
3136{
3137 Py_UNICODE *p;
3138 Py_ssize_t n = 0;
3139 int usize = 0;
3140
3141 assert(size >= 0);
3142
3143 /* Skip trailing lead-byte unless 'final' is set */
3144 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3145 --size;
3146
3147 /* First get the size of the result */
3148 if (size > 0) {
3149 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3150 if (usize == 0) {
3151 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3152 return -1;
3153 }
3154 }
3155
3156 if (*v == NULL) {
3157 /* Create unicode object */
3158 *v = _PyUnicode_New(usize);
3159 if (*v == NULL)
3160 return -1;
3161 }
3162 else {
3163 /* Extend unicode object */
3164 n = PyUnicode_GET_SIZE(*v);
3165 if (_PyUnicode_Resize(v, n + usize) < 0)
3166 return -1;
3167 }
3168
3169 /* Do the conversion */
3170 if (size > 0) {
3171 p = PyUnicode_AS_UNICODE(*v) + n;
3172 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3173 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3174 return -1;
3175 }
3176 }
3177
3178 return size;
3179}
3180
3181PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3182 Py_ssize_t size,
3183 const char *errors,
3184 Py_ssize_t *consumed)
3185{
3186 PyUnicodeObject *v = NULL;
3187 int done;
3188
3189 if (consumed)
3190 *consumed = 0;
3191
3192#ifdef NEED_RETRY
3193 retry:
3194 if (size > INT_MAX)
3195 done = decode_mbcs(&v, s, INT_MAX, 0);
3196 else
3197#endif
3198 done = decode_mbcs(&v, s, (int)size, !consumed);
3199
3200 if (done < 0) {
3201 Py_XDECREF(v);
3202 return NULL;
3203 }
3204
3205 if (consumed)
3206 *consumed += done;
3207
3208#ifdef NEED_RETRY
3209 if (size > INT_MAX) {
3210 s += done;
3211 size -= done;
3212 goto retry;
3213 }
3214#endif
3215
3216 return (PyObject *)v;
3217}
3218
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003219PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003220 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003221 const char *errors)
3222{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003223 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3224}
3225
3226/*
3227 * Convert unicode into string object (MBCS).
3228 * Returns 0 if succeed, -1 otherwise.
3229 */
3230static int encode_mbcs(PyObject **repr,
3231 const Py_UNICODE *p, /* unicode */
3232 int size) /* size of unicode */
3233{
3234 int mbcssize = 0;
3235 Py_ssize_t n = 0;
3236
3237 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003238
3239 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003240 if (size > 0) {
3241 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3242 if (mbcssize == 0) {
3243 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3244 return -1;
3245 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003246 }
3247
Martin v. Löwisd8251432006-06-14 05:21:04 +00003248 if (*repr == NULL) {
3249 /* Create string object */
3250 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3251 if (*repr == NULL)
3252 return -1;
3253 }
3254 else {
3255 /* Extend string object */
3256 n = PyString_Size(*repr);
3257 if (_PyString_Resize(repr, n + mbcssize) < 0)
3258 return -1;
3259 }
3260
3261 /* Do the conversion */
3262 if (size > 0) {
3263 char *s = PyString_AS_STRING(*repr) + n;
3264 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3265 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3266 return -1;
3267 }
3268 }
3269
3270 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003271}
3272
3273PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003274 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003275 const char *errors)
3276{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003277 PyObject *repr = NULL;
3278 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003279
Martin v. Löwisd8251432006-06-14 05:21:04 +00003280#ifdef NEED_RETRY
3281 retry:
3282 if (size > INT_MAX)
3283 ret = encode_mbcs(&repr, p, INT_MAX);
3284 else
3285#endif
3286 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003287
Martin v. Löwisd8251432006-06-14 05:21:04 +00003288 if (ret < 0) {
3289 Py_XDECREF(repr);
3290 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003291 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003292
3293#ifdef NEED_RETRY
3294 if (size > INT_MAX) {
3295 p += INT_MAX;
3296 size -= INT_MAX;
3297 goto retry;
3298 }
3299#endif
3300
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003301 return repr;
3302}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003303
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003304PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3305{
3306 if (!PyUnicode_Check(unicode)) {
3307 PyErr_BadArgument();
3308 return NULL;
3309 }
3310 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3311 PyUnicode_GET_SIZE(unicode),
3312 NULL);
3313}
3314
Martin v. Löwisd8251432006-06-14 05:21:04 +00003315#undef NEED_RETRY
3316
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003317#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003318
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319/* --- Character Mapping Codec -------------------------------------------- */
3320
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003322 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323 PyObject *mapping,
3324 const char *errors)
3325{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003327 Py_ssize_t startinpos;
3328 Py_ssize_t endinpos;
3329 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 PyUnicodeObject *v;
3332 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003334 PyObject *errorHandler = NULL;
3335 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003336 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003337 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003338
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 /* Default to Latin-1 */
3340 if (mapping == NULL)
3341 return PyUnicode_DecodeLatin1(s, size, errors);
3342
3343 v = _PyUnicode_New(size);
3344 if (v == NULL)
3345 goto onError;
3346 if (size == 0)
3347 return (PyObject *)v;
3348 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003350 if (PyUnicode_CheckExact(mapping)) {
3351 mapstring = PyUnicode_AS_UNICODE(mapping);
3352 maplen = PyUnicode_GET_SIZE(mapping);
3353 while (s < e) {
3354 unsigned char ch = *s;
3355 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003357 if (ch < maplen)
3358 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003360 if (x == 0xfffe) {
3361 /* undefined mapping */
3362 outpos = p-PyUnicode_AS_UNICODE(v);
3363 startinpos = s-starts;
3364 endinpos = startinpos+1;
3365 if (unicode_decode_call_errorhandler(
3366 errors, &errorHandler,
3367 "charmap", "character maps to <undefined>",
3368 starts, size, &startinpos, &endinpos, &exc, &s,
3369 (PyObject **)&v, &outpos, &p)) {
3370 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003371 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003372 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00003373 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003374 *p++ = x;
3375 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003377 }
3378 else {
3379 while (s < e) {
3380 unsigned char ch = *s;
3381 PyObject *w, *x;
3382
3383 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3384 w = PyInt_FromLong((long)ch);
3385 if (w == NULL)
3386 goto onError;
3387 x = PyObject_GetItem(mapping, w);
3388 Py_DECREF(w);
3389 if (x == NULL) {
3390 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3391 /* No mapping found means: mapping is undefined. */
3392 PyErr_Clear();
3393 x = Py_None;
3394 Py_INCREF(x);
3395 } else
3396 goto onError;
3397 }
3398
3399 /* Apply mapping */
3400 if (PyInt_Check(x)) {
3401 long value = PyInt_AS_LONG(x);
3402 if (value < 0 || value > 65535) {
3403 PyErr_SetString(PyExc_TypeError,
3404 "character mapping must be in range(65536)");
3405 Py_DECREF(x);
3406 goto onError;
3407 }
3408 *p++ = (Py_UNICODE)value;
3409 }
3410 else if (x == Py_None) {
3411 /* undefined mapping */
3412 outpos = p-PyUnicode_AS_UNICODE(v);
3413 startinpos = s-starts;
3414 endinpos = startinpos+1;
3415 if (unicode_decode_call_errorhandler(
3416 errors, &errorHandler,
3417 "charmap", "character maps to <undefined>",
3418 starts, size, &startinpos, &endinpos, &exc, &s,
3419 (PyObject **)&v, &outpos, &p)) {
3420 Py_DECREF(x);
3421 goto onError;
3422 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00003423 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003424 continue;
3425 }
3426 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003427 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003428
3429 if (targetsize == 1)
3430 /* 1-1 mapping */
3431 *p++ = *PyUnicode_AS_UNICODE(x);
3432
3433 else if (targetsize > 1) {
3434 /* 1-n mapping */
3435 if (targetsize > extrachars) {
3436 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003437 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3438 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003439 (targetsize << 2);
3440 extrachars += needed;
Armin Rigo7ccbca92006-10-04 12:17:45 +00003441 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003442 if (_PyUnicode_Resize(&v,
3443 PyUnicode_GET_SIZE(v) + needed) < 0) {
3444 Py_DECREF(x);
3445 goto onError;
3446 }
3447 p = PyUnicode_AS_UNICODE(v) + oldpos;
3448 }
3449 Py_UNICODE_COPY(p,
3450 PyUnicode_AS_UNICODE(x),
3451 targetsize);
3452 p += targetsize;
3453 extrachars -= targetsize;
3454 }
3455 /* 1-0 mapping: skip the character */
3456 }
3457 else {
3458 /* wrong return value */
3459 PyErr_SetString(PyExc_TypeError,
3460 "character mapping must return integer, None or unicode");
3461 Py_DECREF(x);
3462 goto onError;
3463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003465 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 }
3468 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Martin v. Löwis412fb672006-04-13 06:34:32 +00003469 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 Py_XDECREF(errorHandler);
3472 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003474
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 Py_XDECREF(errorHandler);
3477 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 Py_XDECREF(v);
3479 return NULL;
3480}
3481
Martin v. Löwis3f767792006-06-04 19:36:28 +00003482/* Charmap encoding: the lookup table */
3483
3484struct encoding_map{
3485 PyObject_HEAD
3486 unsigned char level1[32];
3487 int count2, count3;
3488 unsigned char level23[1];
3489};
3490
3491static PyObject*
3492encoding_map_size(PyObject *obj, PyObject* args)
3493{
3494 struct encoding_map *map = (struct encoding_map*)obj;
3495 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3496 128*map->count3);
3497}
3498
3499static PyMethodDef encoding_map_methods[] = {
3500 {"size", encoding_map_size, METH_NOARGS,
3501 PyDoc_STR("Return the size (in bytes) of this object") },
3502 { 0 }
3503};
3504
3505static void
3506encoding_map_dealloc(PyObject* o)
3507{
3508 PyObject_FREE(o);
3509}
3510
3511static PyTypeObject EncodingMapType = {
Martin v. Löwis68192102007-07-21 06:55:02 +00003512 PyVarObject_HEAD_INIT(NULL, 0)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003513 "EncodingMap", /*tp_name*/
3514 sizeof(struct encoding_map), /*tp_basicsize*/
3515 0, /*tp_itemsize*/
3516 /* methods */
3517 encoding_map_dealloc, /*tp_dealloc*/
3518 0, /*tp_print*/
3519 0, /*tp_getattr*/
3520 0, /*tp_setattr*/
3521 0, /*tp_compare*/
3522 0, /*tp_repr*/
3523 0, /*tp_as_number*/
3524 0, /*tp_as_sequence*/
3525 0, /*tp_as_mapping*/
3526 0, /*tp_hash*/
3527 0, /*tp_call*/
3528 0, /*tp_str*/
3529 0, /*tp_getattro*/
3530 0, /*tp_setattro*/
3531 0, /*tp_as_buffer*/
3532 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3533 0, /*tp_doc*/
3534 0, /*tp_traverse*/
3535 0, /*tp_clear*/
3536 0, /*tp_richcompare*/
3537 0, /*tp_weaklistoffset*/
3538 0, /*tp_iter*/
3539 0, /*tp_iternext*/
3540 encoding_map_methods, /*tp_methods*/
3541 0, /*tp_members*/
3542 0, /*tp_getset*/
3543 0, /*tp_base*/
3544 0, /*tp_dict*/
3545 0, /*tp_descr_get*/
3546 0, /*tp_descr_set*/
3547 0, /*tp_dictoffset*/
3548 0, /*tp_init*/
3549 0, /*tp_alloc*/
3550 0, /*tp_new*/
3551 0, /*tp_free*/
3552 0, /*tp_is_gc*/
3553};
3554
3555PyObject*
3556PyUnicode_BuildEncodingMap(PyObject* string)
3557{
3558 Py_UNICODE *decode;
3559 PyObject *result;
3560 struct encoding_map *mresult;
3561 int i;
3562 int need_dict = 0;
3563 unsigned char level1[32];
3564 unsigned char level2[512];
3565 unsigned char *mlevel1, *mlevel2, *mlevel3;
3566 int count2 = 0, count3 = 0;
3567
3568 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3569 PyErr_BadArgument();
3570 return NULL;
3571 }
3572 decode = PyUnicode_AS_UNICODE(string);
3573 memset(level1, 0xFF, sizeof level1);
3574 memset(level2, 0xFF, sizeof level2);
3575
3576 /* If there isn't a one-to-one mapping of NULL to \0,
3577 or if there are non-BMP characters, we need to use
3578 a mapping dictionary. */
3579 if (decode[0] != 0)
3580 need_dict = 1;
3581 for (i = 1; i < 256; i++) {
3582 int l1, l2;
3583 if (decode[i] == 0
3584 #ifdef Py_UNICODE_WIDE
3585 || decode[i] > 0xFFFF
3586 #endif
3587 ) {
3588 need_dict = 1;
3589 break;
3590 }
3591 if (decode[i] == 0xFFFE)
3592 /* unmapped character */
3593 continue;
3594 l1 = decode[i] >> 11;
3595 l2 = decode[i] >> 7;
3596 if (level1[l1] == 0xFF)
3597 level1[l1] = count2++;
3598 if (level2[l2] == 0xFF)
3599 level2[l2] = count3++;
3600 }
3601
3602 if (count2 >= 0xFF || count3 >= 0xFF)
3603 need_dict = 1;
3604
3605 if (need_dict) {
3606 PyObject *result = PyDict_New();
3607 PyObject *key, *value;
3608 if (!result)
3609 return NULL;
3610 for (i = 0; i < 256; i++) {
3611 key = value = NULL;
3612 key = PyInt_FromLong(decode[i]);
3613 value = PyInt_FromLong(i);
3614 if (!key || !value)
3615 goto failed1;
3616 if (PyDict_SetItem(result, key, value) == -1)
3617 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00003618 Py_DECREF(key);
3619 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003620 }
3621 return result;
3622 failed1:
3623 Py_XDECREF(key);
3624 Py_XDECREF(value);
3625 Py_DECREF(result);
3626 return NULL;
3627 }
3628
3629 /* Create a three-level trie */
3630 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3631 16*count2 + 128*count3 - 1);
3632 if (!result)
3633 return PyErr_NoMemory();
3634 PyObject_Init(result, &EncodingMapType);
3635 mresult = (struct encoding_map*)result;
3636 mresult->count2 = count2;
3637 mresult->count3 = count3;
3638 mlevel1 = mresult->level1;
3639 mlevel2 = mresult->level23;
3640 mlevel3 = mresult->level23 + 16*count2;
3641 memcpy(mlevel1, level1, 32);
3642 memset(mlevel2, 0xFF, 16*count2);
3643 memset(mlevel3, 0, 128*count3);
3644 count3 = 0;
3645 for (i = 1; i < 256; i++) {
3646 int o1, o2, o3, i2, i3;
3647 if (decode[i] == 0xFFFE)
3648 /* unmapped character */
3649 continue;
3650 o1 = decode[i]>>11;
3651 o2 = (decode[i]>>7) & 0xF;
3652 i2 = 16*mlevel1[o1] + o2;
3653 if (mlevel2[i2] == 0xFF)
3654 mlevel2[i2] = count3++;
3655 o3 = decode[i] & 0x7F;
3656 i3 = 128*mlevel2[i2] + o3;
3657 mlevel3[i3] = i;
3658 }
3659 return result;
3660}
3661
3662static int
3663encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3664{
3665 struct encoding_map *map = (struct encoding_map*)mapping;
3666 int l1 = c>>11;
3667 int l2 = (c>>7) & 0xF;
3668 int l3 = c & 0x7F;
3669 int i;
3670
3671#ifdef Py_UNICODE_WIDE
3672 if (c > 0xFFFF) {
3673 return -1;
3674 }
3675#endif
3676 if (c == 0)
3677 return 0;
3678 /* level 1*/
3679 i = map->level1[l1];
3680 if (i == 0xFF) {
3681 return -1;
3682 }
3683 /* level 2*/
3684 i = map->level23[16*i+l2];
3685 if (i == 0xFF) {
3686 return -1;
3687 }
3688 /* level 3 */
3689 i = map->level23[16*map->count2 + 128*i + l3];
3690 if (i == 0) {
3691 return -1;
3692 }
3693 return i;
3694}
3695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696/* Lookup the character ch in the mapping. If the character
3697 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00003698 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 PyObject *w = PyInt_FromLong((long)c);
3702 PyObject *x;
3703
3704 if (w == NULL)
3705 return NULL;
3706 x = PyObject_GetItem(mapping, w);
3707 Py_DECREF(w);
3708 if (x == NULL) {
3709 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3710 /* No mapping found means: mapping is undefined. */
3711 PyErr_Clear();
3712 x = Py_None;
3713 Py_INCREF(x);
3714 return x;
3715 } else
3716 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003718 else if (x == Py_None)
3719 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 else if (PyInt_Check(x)) {
3721 long value = PyInt_AS_LONG(x);
3722 if (value < 0 || value > 255) {
3723 PyErr_SetString(PyExc_TypeError,
3724 "character mapping must be in range(256)");
3725 Py_DECREF(x);
3726 return NULL;
3727 }
3728 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 else if (PyString_Check(x))
3731 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 /* wrong return value */
3734 PyErr_SetString(PyExc_TypeError,
3735 "character mapping must return integer, None or str");
3736 Py_DECREF(x);
3737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
3739}
3740
Martin v. Löwis3f767792006-06-04 19:36:28 +00003741static int
3742charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3743{
3744 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3745 /* exponentially overallocate to minimize reallocations */
3746 if (requiredsize < 2*outsize)
3747 requiredsize = 2*outsize;
3748 if (_PyString_Resize(outobj, requiredsize)) {
3749 return 0;
3750 }
3751 return 1;
3752}
3753
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757/* lookup the character, put the result in the output string and adjust
3758 various state variables. Reallocate the output string if not enough
3759 space is available. Return a new reference to the object that
3760 was put in the output buffer, or Py_None, if the mapping was undefined
3761 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003762 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763static
Martin v. Löwis3f767792006-06-04 19:36:28 +00003764charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003765 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766{
Martin v. Löwis3f767792006-06-04 19:36:28 +00003767 PyObject *rep;
3768 char *outstart;
3769 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770
Martin v. Löwis68192102007-07-21 06:55:02 +00003771 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003772 int res = encoding_map_lookup(c, mapping);
3773 Py_ssize_t requiredsize = *outpos+1;
3774 if (res == -1)
3775 return enc_FAILED;
3776 if (outsize<requiredsize)
3777 if (!charmapencode_resize(outobj, outpos, requiredsize))
3778 return enc_EXCEPTION;
3779 outstart = PyString_AS_STRING(*outobj);
3780 outstart[(*outpos)++] = (char)res;
3781 return enc_SUCCESS;
3782 }
3783
3784 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 if (rep==NULL)
Martin v. Löwis3f767792006-06-04 19:36:28 +00003786 return enc_EXCEPTION;
3787 else if (rep==Py_None) {
3788 Py_DECREF(rep);
3789 return enc_FAILED;
3790 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 if (PyInt_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003792 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003793 if (outsize<requiredsize)
3794 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003796 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003798 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003799 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3800 }
3801 else {
3802 const char *repchars = PyString_AS_STRING(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00003803 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3804 Py_ssize_t requiredsize = *outpos+repsize;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003805 if (outsize<requiredsize)
3806 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003808 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003810 outstart = PyString_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 memcpy(outstart + *outpos, repchars, repsize);
3812 *outpos += repsize;
3813 }
3814 }
Georg Brandl9f167602006-06-04 21:46:16 +00003815 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003816 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817}
3818
3819/* handle an error in PyUnicode_EncodeCharmap
3820 Return 0 on success, -1 on error */
3821static
3822int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00003823 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003824 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003825 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003826 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003827{
3828 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003829 Py_ssize_t repsize;
3830 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831 Py_UNICODE *uni2;
3832 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003833 Py_ssize_t collstartpos = *inpos;
3834 Py_ssize_t collendpos = *inpos+1;
3835 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 char *encoding = "charmap";
3837 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00003838 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003840 /* find all unencodable characters */
3841 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003842 PyObject *rep;
Martin v. Löwis68192102007-07-21 06:55:02 +00003843 if (Py_Type(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00003844 int res = encoding_map_lookup(p[collendpos], mapping);
3845 if (res != -1)
3846 break;
3847 ++collendpos;
3848 continue;
3849 }
3850
3851 rep = charmapencode_lookup(p[collendpos], mapping);
3852 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003854 else if (rep!=Py_None) {
3855 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 break;
3857 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003858 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003859 ++collendpos;
3860 }
3861 /* cache callback name lookup
3862 * (if not done yet, i.e. it's the first error) */
3863 if (*known_errorHandler==-1) {
3864 if ((errors==NULL) || (!strcmp(errors, "strict")))
3865 *known_errorHandler = 1;
3866 else if (!strcmp(errors, "replace"))
3867 *known_errorHandler = 2;
3868 else if (!strcmp(errors, "ignore"))
3869 *known_errorHandler = 3;
3870 else if (!strcmp(errors, "xmlcharrefreplace"))
3871 *known_errorHandler = 4;
3872 else
3873 *known_errorHandler = 0;
3874 }
3875 switch (*known_errorHandler) {
3876 case 1: /* strict */
3877 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3878 return -1;
3879 case 2: /* replace */
3880 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3881 x = charmapencode_output('?', mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003882 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 return -1;
3884 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003885 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3887 return -1;
3888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003889 }
3890 /* fall through */
3891 case 3: /* ignore */
3892 *inpos = collendpos;
3893 break;
3894 case 4: /* xmlcharrefreplace */
3895 /* generate replacement (temporarily (mis)uses p) */
3896 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3897 char buffer[2+29+1+1];
3898 char *cp;
3899 sprintf(buffer, "&#%d;", (int)p[collpos]);
3900 for (cp = buffer; *cp; ++cp) {
3901 x = charmapencode_output(*cp, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003902 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003904 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003905 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3906 return -1;
3907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 }
3909 }
3910 *inpos = collendpos;
3911 break;
3912 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003913 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 encoding, reason, p, size, exceptionObject,
3915 collstartpos, collendpos, &newpos);
3916 if (repunicode == NULL)
3917 return -1;
3918 /* generate replacement */
3919 repsize = PyUnicode_GET_SIZE(repunicode);
3920 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3921 x = charmapencode_output(*uni2, mapping, res, respos);
Martin v. Löwis3f767792006-06-04 19:36:28 +00003922 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 return -1;
3924 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00003925 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003926 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003927 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3928 return -1;
3929 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 }
3931 *inpos = newpos;
3932 Py_DECREF(repunicode);
3933 }
3934 return 0;
3935}
3936
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003938 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939 PyObject *mapping,
3940 const char *errors)
3941{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 /* output object */
3943 PyObject *res = NULL;
3944 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003945 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003947 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003948 PyObject *errorHandler = NULL;
3949 PyObject *exc = NULL;
3950 /* the following variable is used for caching string comparisons
3951 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3952 * 3=ignore, 4=xmlcharrefreplace */
3953 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954
3955 /* Default to Latin-1 */
3956 if (mapping == NULL)
3957 return PyUnicode_EncodeLatin1(p, size, errors);
3958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003959 /* allocate enough for a simple encoding without
3960 replacements, if we need more, we'll resize */
3961 res = PyString_FromStringAndSize(NULL, size);
3962 if (res == NULL)
3963 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003964 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 while (inpos<size) {
3968 /* try to encode it */
Martin v. Löwis3f767792006-06-04 19:36:28 +00003969 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3970 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 goto onError;
Martin v. Löwis3f767792006-06-04 19:36:28 +00003972 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 if (charmap_encoding_error(p, size, &inpos, mapping,
3974 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003975 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003976 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003977 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980 else
3981 /* done with this character => adjust input position */
3982 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 /* Resize if we allocated to much */
3986 if (respos<PyString_GET_SIZE(res)) {
3987 if (_PyString_Resize(&res, respos))
3988 goto onError;
3989 }
3990 Py_XDECREF(exc);
3991 Py_XDECREF(errorHandler);
3992 return res;
3993
3994 onError:
3995 Py_XDECREF(res);
3996 Py_XDECREF(exc);
3997 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 return NULL;
3999}
4000
4001PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4002 PyObject *mapping)
4003{
4004 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4005 PyErr_BadArgument();
4006 return NULL;
4007 }
4008 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4009 PyUnicode_GET_SIZE(unicode),
4010 mapping,
4011 NULL);
4012}
4013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014/* create or adjust a UnicodeTranslateError */
4015static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004016 const Py_UNICODE *unicode, Py_ssize_t size,
4017 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004018 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004020 if (*exceptionObject == NULL) {
4021 *exceptionObject = PyUnicodeTranslateError_Create(
4022 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 }
4024 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004025 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4026 goto onError;
4027 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4028 goto onError;
4029 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4030 goto onError;
4031 return;
4032 onError:
4033 Py_DECREF(*exceptionObject);
4034 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 }
4036}
4037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038/* raises a UnicodeTranslateError */
4039static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004040 const Py_UNICODE *unicode, Py_ssize_t size,
4041 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 const char *reason)
4043{
4044 make_translate_exception(exceptionObject,
4045 unicode, size, startpos, endpos, reason);
4046 if (*exceptionObject != NULL)
4047 PyCodec_StrictErrors(*exceptionObject);
4048}
4049
4050/* error handling callback helper:
4051 build arguments, call the callback and check the arguments,
4052 put the result into newpos and return the replacement string, which
4053 has to be freed by the caller */
4054static PyObject *unicode_translate_call_errorhandler(const char *errors,
4055 PyObject **errorHandler,
4056 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004057 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4058 Py_ssize_t startpos, Py_ssize_t endpos,
4059 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004061 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062
Martin v. Löwis412fb672006-04-13 06:34:32 +00004063 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 PyObject *restuple;
4065 PyObject *resunicode;
4066
4067 if (*errorHandler == NULL) {
4068 *errorHandler = PyCodec_LookupError(errors);
4069 if (*errorHandler == NULL)
4070 return NULL;
4071 }
4072
4073 make_translate_exception(exceptionObject,
4074 unicode, size, startpos, endpos, reason);
4075 if (*exceptionObject == NULL)
4076 return NULL;
4077
4078 restuple = PyObject_CallFunctionObjArgs(
4079 *errorHandler, *exceptionObject, NULL);
4080 if (restuple == NULL)
4081 return NULL;
4082 if (!PyTuple_Check(restuple)) {
4083 PyErr_Format(PyExc_TypeError, &argparse[4]);
4084 Py_DECREF(restuple);
4085 return NULL;
4086 }
4087 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004088 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 Py_DECREF(restuple);
4090 return NULL;
4091 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004092 if (i_newpos<0)
4093 *newpos = size+i_newpos;
4094 else
4095 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004096 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00004097 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004098 Py_DECREF(restuple);
4099 return NULL;
4100 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 Py_INCREF(resunicode);
4102 Py_DECREF(restuple);
4103 return resunicode;
4104}
4105
4106/* Lookup the character ch in the mapping and put the result in result,
4107 which must be decrefed by the caller.
4108 Return 0 on success, -1 on error */
4109static
4110int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4111{
4112 PyObject *w = PyInt_FromLong((long)c);
4113 PyObject *x;
4114
4115 if (w == NULL)
4116 return -1;
4117 x = PyObject_GetItem(mapping, w);
4118 Py_DECREF(w);
4119 if (x == NULL) {
4120 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4121 /* No mapping found means: use 1:1 mapping. */
4122 PyErr_Clear();
4123 *result = NULL;
4124 return 0;
4125 } else
4126 return -1;
4127 }
4128 else if (x == Py_None) {
4129 *result = x;
4130 return 0;
4131 }
4132 else if (PyInt_Check(x)) {
4133 long value = PyInt_AS_LONG(x);
4134 long max = PyUnicode_GetMax();
4135 if (value < 0 || value > max) {
4136 PyErr_Format(PyExc_TypeError,
4137 "character mapping must be in range(0x%lx)", max+1);
4138 Py_DECREF(x);
4139 return -1;
4140 }
4141 *result = x;
4142 return 0;
4143 }
4144 else if (PyUnicode_Check(x)) {
4145 *result = x;
4146 return 0;
4147 }
4148 else {
4149 /* wrong return value */
4150 PyErr_SetString(PyExc_TypeError,
4151 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00004152 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 return -1;
4154 }
4155}
4156/* ensure that *outobj is at least requiredsize characters long,
4157if not reallocate and adjust various state variables.
4158Return 0 on success, -1 on error */
4159static
Walter Dörwald4894c302003-10-24 14:25:28 +00004160int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004161 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004163 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004164 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004166 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00004168 if (requiredsize < 2 * oldsize)
4169 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004170 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171 return -1;
4172 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 }
4174 return 0;
4175}
4176/* lookup the character, put the result in the output string and adjust
4177 various state variables. Return a new reference to the object that
4178 was put in the output buffer in *result, or Py_None, if the mapping was
4179 undefined (in which case no character was written).
4180 The called must decref result.
4181 Return 0 on success, -1 on error. */
4182static
Walter Dörwald4894c302003-10-24 14:25:28 +00004183int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004184 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00004185 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186{
Walter Dörwald4894c302003-10-24 14:25:28 +00004187 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 return -1;
4189 if (*res==NULL) {
4190 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00004191 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 }
4193 else if (*res==Py_None)
4194 ;
4195 else if (PyInt_Check(*res)) {
4196 /* no overflow check, because we know that the space is enough */
4197 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4198 }
4199 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004200 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 if (repsize==1) {
4202 /* no overflow check, because we know that the space is enough */
4203 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4204 }
4205 else if (repsize!=0) {
4206 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004207 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00004208 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00004209 repsize - 1;
4210 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 return -1;
4212 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4213 *outp += repsize;
4214 }
4215 }
4216 else
4217 return -1;
4218 return 0;
4219}
4220
4221PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004222 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 PyObject *mapping,
4224 const char *errors)
4225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 /* output object */
4227 PyObject *res = NULL;
4228 /* pointers to the beginning and end+1 of input */
4229 const Py_UNICODE *startp = p;
4230 const Py_UNICODE *endp = p + size;
4231 /* pointer into the output */
4232 Py_UNICODE *str;
4233 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004234 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 char *reason = "character maps to <undefined>";
4236 PyObject *errorHandler = NULL;
4237 PyObject *exc = NULL;
4238 /* the following variable is used for caching string comparisons
4239 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4240 * 3=ignore, 4=xmlcharrefreplace */
4241 int known_errorHandler = -1;
4242
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243 if (mapping == NULL) {
4244 PyErr_BadArgument();
4245 return NULL;
4246 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247
4248 /* allocate enough for a simple 1:1 translation without
4249 replacements, if we need more, we'll resize */
4250 res = PyUnicode_FromUnicode(NULL, size);
4251 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00004252 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 return res;
4255 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257 while (p<endp) {
4258 /* try to encode it */
4259 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00004260 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262 goto onError;
4263 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00004264 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265 if (x!=Py_None) /* it worked => adjust input pointer */
4266 ++p;
4267 else { /* untranslatable character */
4268 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004269 Py_ssize_t repsize;
4270 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 Py_UNICODE *uni2;
4272 /* startpos for collecting untranslatable chars */
4273 const Py_UNICODE *collstart = p;
4274 const Py_UNICODE *collend = p+1;
4275 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 /* find all untranslatable characters */
4278 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00004279 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 goto onError;
4281 Py_XDECREF(x);
4282 if (x!=Py_None)
4283 break;
4284 ++collend;
4285 }
4286 /* cache callback name lookup
4287 * (if not done yet, i.e. it's the first error) */
4288 if (known_errorHandler==-1) {
4289 if ((errors==NULL) || (!strcmp(errors, "strict")))
4290 known_errorHandler = 1;
4291 else if (!strcmp(errors, "replace"))
4292 known_errorHandler = 2;
4293 else if (!strcmp(errors, "ignore"))
4294 known_errorHandler = 3;
4295 else if (!strcmp(errors, "xmlcharrefreplace"))
4296 known_errorHandler = 4;
4297 else
4298 known_errorHandler = 0;
4299 }
4300 switch (known_errorHandler) {
4301 case 1: /* strict */
4302 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4303 goto onError;
4304 case 2: /* replace */
4305 /* No need to check for space, this is a 1:1 replacement */
4306 for (coll = collstart; coll<collend; ++coll)
4307 *str++ = '?';
4308 /* fall through */
4309 case 3: /* ignore */
4310 p = collend;
4311 break;
4312 case 4: /* xmlcharrefreplace */
4313 /* generate replacement (temporarily (mis)uses p) */
4314 for (p = collstart; p < collend; ++p) {
4315 char buffer[2+29+1+1];
4316 char *cp;
4317 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00004318 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4320 goto onError;
4321 for (cp = buffer; *cp; ++cp)
4322 *str++ = *cp;
4323 }
4324 p = collend;
4325 break;
4326 default:
4327 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4328 reason, startp, size, &exc,
4329 collstart-startp, collend-startp, &newpos);
4330 if (repunicode == NULL)
4331 goto onError;
4332 /* generate replacement */
4333 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00004334 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4336 Py_DECREF(repunicode);
4337 goto onError;
4338 }
4339 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4340 *str++ = *uni2;
4341 p = startp + newpos;
4342 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 }
4344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 /* Resize if we allocated to much */
4347 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004348 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004349 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004350 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351 }
4352 Py_XDECREF(exc);
4353 Py_XDECREF(errorHandler);
4354 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 onError:
4357 Py_XDECREF(res);
4358 Py_XDECREF(exc);
4359 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360 return NULL;
4361}
4362
4363PyObject *PyUnicode_Translate(PyObject *str,
4364 PyObject *mapping,
4365 const char *errors)
4366{
4367 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00004368
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 str = PyUnicode_FromObject(str);
4370 if (str == NULL)
4371 goto onError;
4372 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4373 PyUnicode_GET_SIZE(str),
4374 mapping,
4375 errors);
4376 Py_DECREF(str);
4377 return result;
Tim Petersced69f82003-09-16 20:30:58 +00004378
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 onError:
4380 Py_XDECREF(str);
4381 return NULL;
4382}
Tim Petersced69f82003-09-16 20:30:58 +00004383
Guido van Rossum9e896b32000-04-05 20:11:21 +00004384/* --- Decimal Encoder ---------------------------------------------------- */
4385
4386int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004387 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00004388 char *output,
4389 const char *errors)
4390{
4391 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004392 PyObject *errorHandler = NULL;
4393 PyObject *exc = NULL;
4394 const char *encoding = "decimal";
4395 const char *reason = "invalid decimal Unicode string";
4396 /* the following variable is used for caching string comparisons
4397 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4398 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004399
4400 if (output == NULL) {
4401 PyErr_BadArgument();
4402 return -1;
4403 }
4404
4405 p = s;
4406 end = s + length;
4407 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004409 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004411 Py_ssize_t repsize;
4412 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 Py_UNICODE *uni2;
4414 Py_UNICODE *collstart;
4415 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00004416
Guido van Rossum9e896b32000-04-05 20:11:21 +00004417 if (Py_UNICODE_ISSPACE(ch)) {
4418 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004420 continue;
4421 }
4422 decimal = Py_UNICODE_TODECIMAL(ch);
4423 if (decimal >= 0) {
4424 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004426 continue;
4427 }
Guido van Rossumba477042000-04-06 18:18:10 +00004428 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00004429 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004431 continue;
4432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 /* All other characters are considered unencodable */
4434 collstart = p;
4435 collend = p+1;
4436 while (collend < end) {
4437 if ((0 < *collend && *collend < 256) ||
4438 !Py_UNICODE_ISSPACE(*collend) ||
4439 Py_UNICODE_TODECIMAL(*collend))
4440 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00004441 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 /* cache callback name lookup
4443 * (if not done yet, i.e. it's the first error) */
4444 if (known_errorHandler==-1) {
4445 if ((errors==NULL) || (!strcmp(errors, "strict")))
4446 known_errorHandler = 1;
4447 else if (!strcmp(errors, "replace"))
4448 known_errorHandler = 2;
4449 else if (!strcmp(errors, "ignore"))
4450 known_errorHandler = 3;
4451 else if (!strcmp(errors, "xmlcharrefreplace"))
4452 known_errorHandler = 4;
4453 else
4454 known_errorHandler = 0;
4455 }
4456 switch (known_errorHandler) {
4457 case 1: /* strict */
4458 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4459 goto onError;
4460 case 2: /* replace */
4461 for (p = collstart; p < collend; ++p)
4462 *output++ = '?';
4463 /* fall through */
4464 case 3: /* ignore */
4465 p = collend;
4466 break;
4467 case 4: /* xmlcharrefreplace */
4468 /* generate replacement (temporarily (mis)uses p) */
4469 for (p = collstart; p < collend; ++p)
4470 output += sprintf(output, "&#%d;", (int)*p);
4471 p = collend;
4472 break;
4473 default:
4474 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4475 encoding, reason, s, length, &exc,
4476 collstart-s, collend-s, &newpos);
4477 if (repunicode == NULL)
4478 goto onError;
4479 /* generate replacement */
4480 repsize = PyUnicode_GET_SIZE(repunicode);
4481 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4482 Py_UNICODE ch = *uni2;
4483 if (Py_UNICODE_ISSPACE(ch))
4484 *output++ = ' ';
4485 else {
4486 decimal = Py_UNICODE_TODECIMAL(ch);
4487 if (decimal >= 0)
4488 *output++ = '0' + decimal;
4489 else if (0 < ch && ch < 256)
4490 *output++ = (char)ch;
4491 else {
4492 Py_DECREF(repunicode);
4493 raise_encode_exception(&exc, encoding,
4494 s, length, collstart-s, collend-s, reason);
4495 goto onError;
4496 }
4497 }
4498 }
4499 p = s + newpos;
4500 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004501 }
4502 }
4503 /* 0-terminate the output string */
4504 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 Py_XDECREF(exc);
4506 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004507 return 0;
4508
4509 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 Py_XDECREF(exc);
4511 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00004512 return -1;
4513}
4514
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515/* --- Helpers ------------------------------------------------------------ */
4516
Fredrik Lundha50d2012006-05-26 17:04:58 +00004517#define STRINGLIB_CHAR Py_UNICODE
Fredrik Lundh6471ee42006-05-24 14:28:11 +00004518
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004519#define STRINGLIB_LEN PyUnicode_GET_SIZE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004520#define STRINGLIB_NEW PyUnicode_FromUnicode
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004521#define STRINGLIB_STR PyUnicode_AS_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00004522
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00004523Py_LOCAL_INLINE(int)
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004524STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4525{
Fredrik Lundh9c0e9c02006-05-26 18:24:15 +00004526 if (str[0] != other[0])
4527 return 1;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00004528 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4529}
4530
Fredrik Lundhb9479482006-05-26 17:22:38 +00004531#define STRINGLIB_EMPTY unicode_empty
4532
Fredrik Lundha50d2012006-05-26 17:04:58 +00004533#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004534
4535#include "stringlib/count.h"
4536#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00004537#include "stringlib/partition.h"
4538
/* Helper macro to normalize slice bounds against (obj)->length.  It
   operates on variables named start and end in the enclosing scope:
   negative values count from the end and are clamped to 0, and end is
   capped at the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4551
Martin v. Löwis18e16552006-02-15 17:27:45 +00004552Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004553 PyObject *substr,
4554 Py_ssize_t start,
4555 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004557 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004558 PyUnicodeObject* str_obj;
4559 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00004560
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004561 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4562 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004564 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4565 if (!sub_obj) {
4566 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 return -1;
4568 }
Tim Petersced69f82003-09-16 20:30:58 +00004569
Fredrik Lundhc8162812006-05-26 19:33:03 +00004570 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00004571
Fredrik Lundh58b5e842006-05-26 19:24:53 +00004572 result = stringlib_count(
4573 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4574 );
4575
4576 Py_DECREF(sub_obj);
4577 Py_DECREF(str_obj);
4578
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 return result;
4580}
4581
Martin v. Löwis18e16552006-02-15 17:27:45 +00004582Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004583 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004584 Py_ssize_t start,
4585 Py_ssize_t end,
4586 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004588 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004589
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004590 str = PyUnicode_FromObject(str);
4591 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004592 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004593 sub = PyUnicode_FromObject(sub);
4594 if (!sub) {
4595 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00004596 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 }
Tim Petersced69f82003-09-16 20:30:58 +00004598
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004599 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004600 result = stringlib_find_slice(
4601 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4602 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4603 start, end
4604 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004605 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00004606 result = stringlib_rfind_slice(
4607 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4608 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4609 start, end
4610 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00004611
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00004612 Py_DECREF(str);
4613 Py_DECREF(sub);
4614
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615 return result;
4616}
4617
Tim Petersced69f82003-09-16 20:30:58 +00004618static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619int tailmatch(PyUnicodeObject *self,
4620 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004621 Py_ssize_t start,
4622 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623 int direction)
4624{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625 if (substring->length == 0)
4626 return 1;
4627
Fredrik Lundhc8162812006-05-26 19:33:03 +00004628 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629
4630 end -= substring->length;
4631 if (end < start)
4632 return 0;
4633
4634 if (direction > 0) {
4635 if (Py_UNICODE_MATCH(self, end, substring))
4636 return 1;
4637 } else {
4638 if (Py_UNICODE_MATCH(self, start, substring))
4639 return 1;
4640 }
4641
4642 return 0;
4643}
4644
Martin v. Löwis18e16552006-02-15 17:27:45 +00004645Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004647 Py_ssize_t start,
4648 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649 int direction)
4650{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004651 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00004652
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 str = PyUnicode_FromObject(str);
4654 if (str == NULL)
4655 return -1;
4656 substr = PyUnicode_FromObject(substr);
4657 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004658 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 return -1;
4660 }
Tim Petersced69f82003-09-16 20:30:58 +00004661
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 result = tailmatch((PyUnicodeObject *)str,
4663 (PyUnicodeObject *)substr,
4664 start, end, direction);
4665 Py_DECREF(str);
4666 Py_DECREF(substr);
4667 return result;
4668}
4669
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670/* Apply fixfct filter to the Unicode object self and return a
4671 reference to the modified object */
4672
Tim Petersced69f82003-09-16 20:30:58 +00004673static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674PyObject *fixup(PyUnicodeObject *self,
4675 int (*fixfct)(PyUnicodeObject *s))
4676{
4677
4678 PyUnicodeObject *u;
4679
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004680 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 if (u == NULL)
4682 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004683
4684 Py_UNICODE_COPY(u->str, self->str, self->length);
4685
Tim Peters7a29bd52001-09-12 03:03:31 +00004686 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 /* fixfct should return TRUE if it modified the buffer. If
4688 FALSE, return a reference to the original buffer instead
4689 (to save space, not time) */
4690 Py_INCREF(self);
4691 Py_DECREF(u);
4692 return (PyObject*) self;
4693 }
4694 return (PyObject*) u;
4695}
4696
Tim Petersced69f82003-09-16 20:30:58 +00004697static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698int fixupper(PyUnicodeObject *self)
4699{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004700 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 Py_UNICODE *s = self->str;
4702 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004703
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 while (len-- > 0) {
4705 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004706
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707 ch = Py_UNICODE_TOUPPER(*s);
4708 if (ch != *s) {
4709 status = 1;
4710 *s = ch;
4711 }
4712 s++;
4713 }
4714
4715 return status;
4716}
4717
Tim Petersced69f82003-09-16 20:30:58 +00004718static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719int fixlower(PyUnicodeObject *self)
4720{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004721 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 Py_UNICODE *s = self->str;
4723 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004724
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 while (len-- > 0) {
4726 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004727
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 ch = Py_UNICODE_TOLOWER(*s);
4729 if (ch != *s) {
4730 status = 1;
4731 *s = ch;
4732 }
4733 s++;
4734 }
4735
4736 return status;
4737}
4738
Tim Petersced69f82003-09-16 20:30:58 +00004739static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740int fixswapcase(PyUnicodeObject *self)
4741{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004742 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 Py_UNICODE *s = self->str;
4744 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004745
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 while (len-- > 0) {
4747 if (Py_UNICODE_ISUPPER(*s)) {
4748 *s = Py_UNICODE_TOLOWER(*s);
4749 status = 1;
4750 } else if (Py_UNICODE_ISLOWER(*s)) {
4751 *s = Py_UNICODE_TOUPPER(*s);
4752 status = 1;
4753 }
4754 s++;
4755 }
4756
4757 return status;
4758}
4759
Tim Petersced69f82003-09-16 20:30:58 +00004760static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761int fixcapitalize(PyUnicodeObject *self)
4762{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004763 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004764 Py_UNICODE *s = self->str;
4765 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004766
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004767 if (len == 0)
4768 return 0;
4769 if (Py_UNICODE_ISLOWER(*s)) {
4770 *s = Py_UNICODE_TOUPPER(*s);
4771 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004773 s++;
4774 while (--len > 0) {
4775 if (Py_UNICODE_ISUPPER(*s)) {
4776 *s = Py_UNICODE_TOLOWER(*s);
4777 status = 1;
4778 }
4779 s++;
4780 }
4781 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782}
4783
4784static
4785int fixtitle(PyUnicodeObject *self)
4786{
4787 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4788 register Py_UNICODE *e;
4789 int previous_is_cased;
4790
4791 /* Shortcut for single character strings */
4792 if (PyUnicode_GET_SIZE(self) == 1) {
4793 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4794 if (*p != ch) {
4795 *p = ch;
4796 return 1;
4797 }
4798 else
4799 return 0;
4800 }
Tim Petersced69f82003-09-16 20:30:58 +00004801
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 e = p + PyUnicode_GET_SIZE(self);
4803 previous_is_cased = 0;
4804 for (; p < e; p++) {
4805 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004806
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 if (previous_is_cased)
4808 *p = Py_UNICODE_TOLOWER(ch);
4809 else
4810 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004811
4812 if (Py_UNICODE_ISLOWER(ch) ||
4813 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 Py_UNICODE_ISTITLE(ch))
4815 previous_is_cased = 1;
4816 else
4817 previous_is_cased = 0;
4818 }
4819 return 1;
4820}
4821
Tim Peters8ce9f162004-08-27 01:49:32 +00004822PyObject *
4823PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824{
Tim Peters8ce9f162004-08-27 01:49:32 +00004825 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004826 const Py_UNICODE blank = ' ';
4827 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00004828 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004829 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00004830 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4831 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00004832 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4833 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004834 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004835 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00004836 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837
Tim Peters05eba1f2004-08-27 21:32:02 +00004838 fseq = PySequence_Fast(seq, "");
4839 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004840 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004841 }
4842
Tim Peters91879ab2004-08-27 22:35:44 +00004843 /* Grrrr. A codec may be invoked to convert str objects to
4844 * Unicode, and so it's possible to call back into Python code
4845 * during PyUnicode_FromObject(), and so it's possible for a sick
4846 * codec to change the size of fseq (if seq is a list). Therefore
4847 * we have to keep refetching the size -- can't assume seqlen
4848 * is invariant.
4849 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004850 seqlen = PySequence_Fast_GET_SIZE(fseq);
4851 /* If empty sequence, return u"". */
4852 if (seqlen == 0) {
4853 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4854 goto Done;
4855 }
4856 /* If singleton sequence with an exact Unicode, return that. */
4857 if (seqlen == 1) {
4858 item = PySequence_Fast_GET_ITEM(fseq, 0);
4859 if (PyUnicode_CheckExact(item)) {
4860 Py_INCREF(item);
4861 res = (PyUnicodeObject *)item;
4862 goto Done;
4863 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004864 }
4865
Tim Peters05eba1f2004-08-27 21:32:02 +00004866 /* At least two items to join, or one that isn't exact Unicode. */
4867 if (seqlen > 1) {
4868 /* Set up sep and seplen -- they're needed. */
4869 if (separator == NULL) {
4870 sep = &blank;
4871 seplen = 1;
4872 }
4873 else {
4874 internal_separator = PyUnicode_FromObject(separator);
4875 if (internal_separator == NULL)
4876 goto onError;
4877 sep = PyUnicode_AS_UNICODE(internal_separator);
4878 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004879 /* In case PyUnicode_FromObject() mutated seq. */
4880 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004881 }
4882 }
4883
4884 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004885 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00004886 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004887 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004888 res_p = PyUnicode_AS_UNICODE(res);
4889 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004890
Tim Peters05eba1f2004-08-27 21:32:02 +00004891 for (i = 0; i < seqlen; ++i) {
Tim Peters286085c2006-05-22 19:17:04 +00004892 Py_ssize_t itemlen;
4893 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00004894
4895 item = PySequence_Fast_GET_ITEM(fseq, i);
4896 /* Convert item to Unicode. */
4897 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4898 PyErr_Format(PyExc_TypeError,
Thomas Wouters715a4cd2006-04-16 22:04:49 +00004899 "sequence item %zd: expected string or Unicode,"
Tim Peters05eba1f2004-08-27 21:32:02 +00004900 " %.80s found",
Martin v. Löwis68192102007-07-21 06:55:02 +00004901 i, Py_Type(item)->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004902 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004903 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004904 item = PyUnicode_FromObject(item);
4905 if (item == NULL)
4906 goto onError;
4907 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004908
Tim Peters91879ab2004-08-27 22:35:44 +00004909 /* In case PyUnicode_FromObject() mutated seq. */
4910 seqlen = PySequence_Fast_GET_SIZE(fseq);
4911
Tim Peters8ce9f162004-08-27 01:49:32 +00004912 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004914 new_res_used = res_used + itemlen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004915 if (new_res_used < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004916 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004917 if (i < seqlen - 1) {
4918 new_res_used += seplen;
Georg Brandl90e27d32006-06-10 06:40:50 +00004919 if (new_res_used < 0)
Tim Peters05eba1f2004-08-27 21:32:02 +00004920 goto Overflow;
4921 }
4922 if (new_res_used > res_alloc) {
4923 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004924 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004925 res_alloc += res_alloc;
Tim Peters286085c2006-05-22 19:17:04 +00004926 if (res_alloc <= 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004927 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004928 } while (new_res_used > res_alloc);
Martin v. Löwis412fb672006-04-13 06:34:32 +00004929 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004930 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004932 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004933 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004935
4936 /* Copy item, and maybe the separator. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004937 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004938 res_p += itemlen;
4939 if (i < seqlen - 1) {
Martin v. Löwis412fb672006-04-13 06:34:32 +00004940 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00004941 res_p += seplen;
4942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004944 res_used = new_res_used;
4945 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004946
Tim Peters05eba1f2004-08-27 21:32:02 +00004947 /* Shrink res to match the used area; this probably can't fail,
4948 * but it's cheap to check.
4949 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00004950 if (_PyUnicode_Resize(&res, res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004951 goto onError;
4952
4953 Done:
4954 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004955 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 return (PyObject *)res;
4957
Tim Peters8ce9f162004-08-27 01:49:32 +00004958 Overflow:
4959 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00004960 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00004961 Py_DECREF(item);
4962 /* fall through */
4963
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004965 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004966 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004967 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968 return NULL;
4969}
4970
Tim Petersced69f82003-09-16 20:30:58 +00004971static
4972PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004973 Py_ssize_t left,
4974 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 Py_UNICODE fill)
4976{
4977 PyUnicodeObject *u;
4978
4979 if (left < 0)
4980 left = 0;
4981 if (right < 0)
4982 right = 0;
4983
Tim Peters7a29bd52001-09-12 03:03:31 +00004984 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 Py_INCREF(self);
4986 return self;
4987 }
4988
4989 u = _PyUnicode_New(left + self->length + right);
4990 if (u) {
4991 if (left)
4992 Py_UNICODE_FILL(u->str, fill, left);
4993 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4994 if (right)
4995 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4996 }
4997
4998 return u;
4999}
5000
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on names from the expanding function: a `PyObject *str`
   scratch variable, the `list` being built, and an `onError` label that
   is jumped to when creating the slice or appending it fails.

   Wrapped in do { } while (0) so that the expansion is one statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5011
5012static
5013PyObject *split_whitespace(PyUnicodeObject *self,
5014 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005015 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005017 register Py_ssize_t i;
5018 register Py_ssize_t j;
5019 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 PyObject *str;
5021
5022 for (i = j = 0; i < len; ) {
5023 /* find a token */
5024 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5025 i++;
5026 j = i;
5027 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5028 i++;
5029 if (j < i) {
5030 if (maxcount-- <= 0)
5031 break;
5032 SPLIT_APPEND(self->str, j, i);
5033 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5034 i++;
5035 j = i;
5036 }
5037 }
5038 if (j < len) {
5039 SPLIT_APPEND(self->str, j, len);
5040 }
5041 return list;
5042
5043 onError:
5044 Py_DECREF(list);
5045 return NULL;
5046}
5047
5048PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00005049 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005051 register Py_ssize_t i;
5052 register Py_ssize_t j;
5053 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054 PyObject *list;
5055 PyObject *str;
5056 Py_UNICODE *data;
5057
5058 string = PyUnicode_FromObject(string);
5059 if (string == NULL)
5060 return NULL;
5061 data = PyUnicode_AS_UNICODE(string);
5062 len = PyUnicode_GET_SIZE(string);
5063
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 list = PyList_New(0);
5065 if (!list)
5066 goto onError;
5067
5068 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005069 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005070
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 /* Find a line and append it */
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005072 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074
5075 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00005076 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 if (i < len) {
5078 if (data[i] == '\r' && i + 1 < len &&
5079 data[i+1] == '\n')
5080 i += 2;
5081 else
5082 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00005083 if (keepends)
5084 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 }
Guido van Rossum86662912000-04-11 15:38:46 +00005086 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087 j = i;
5088 }
5089 if (j < len) {
5090 SPLIT_APPEND(data, j, len);
5091 }
5092
5093 Py_DECREF(string);
5094 return list;
5095
5096 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005097 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 Py_DECREF(string);
5099 return NULL;
5100}
5101
Tim Petersced69f82003-09-16 20:30:58 +00005102static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103PyObject *split_char(PyUnicodeObject *self,
5104 PyObject *list,
5105 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005106 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005108 register Py_ssize_t i;
5109 register Py_ssize_t j;
5110 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 PyObject *str;
5112
5113 for (i = j = 0; i < len; ) {
5114 if (self->str[i] == ch) {
5115 if (maxcount-- <= 0)
5116 break;
5117 SPLIT_APPEND(self->str, j, i);
5118 i = j = i + 1;
5119 } else
5120 i++;
5121 }
5122 if (j <= len) {
5123 SPLIT_APPEND(self->str, j, len);
5124 }
5125 return list;
5126
5127 onError:
5128 Py_DECREF(list);
5129 return NULL;
5130}
5131
Tim Petersced69f82003-09-16 20:30:58 +00005132static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133PyObject *split_substring(PyUnicodeObject *self,
5134 PyObject *list,
5135 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005136 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005138 register Py_ssize_t i;
5139 register Py_ssize_t j;
5140 Py_ssize_t len = self->length;
5141 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 PyObject *str;
5143
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005144 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 if (Py_UNICODE_MATCH(self, i, substring)) {
5146 if (maxcount-- <= 0)
5147 break;
5148 SPLIT_APPEND(self->str, j, i);
5149 i = j = i + sublen;
5150 } else
5151 i++;
5152 }
5153 if (j <= len) {
5154 SPLIT_APPEND(self->str, j, len);
5155 }
5156 return list;
5157
5158 onError:
5159 Py_DECREF(list);
5160 return NULL;
5161}
5162
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005163static
5164PyObject *rsplit_whitespace(PyUnicodeObject *self,
5165 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005166 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005168 register Py_ssize_t i;
5169 register Py_ssize_t j;
5170 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005171 PyObject *str;
5172
5173 for (i = j = len - 1; i >= 0; ) {
5174 /* find a token */
5175 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5176 i--;
5177 j = i;
5178 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5179 i--;
5180 if (j > i) {
5181 if (maxcount-- <= 0)
5182 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005183 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005184 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5185 i--;
5186 j = i;
5187 }
5188 }
5189 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005190 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005191 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005192 if (PyList_Reverse(list) < 0)
5193 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005194 return list;
5195
5196 onError:
5197 Py_DECREF(list);
5198 return NULL;
5199}
5200
5201static
5202PyObject *rsplit_char(PyUnicodeObject *self,
5203 PyObject *list,
5204 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005205 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005206{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005207 register Py_ssize_t i;
5208 register Py_ssize_t j;
5209 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005210 PyObject *str;
5211
5212 for (i = j = len - 1; i >= 0; ) {
5213 if (self->str[i] == ch) {
5214 if (maxcount-- <= 0)
5215 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005216 SPLIT_APPEND(self->str, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005217 j = i = i - 1;
5218 } else
5219 i--;
5220 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005221 if (j >= -1) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005222 SPLIT_APPEND(self->str, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005223 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005224 if (PyList_Reverse(list) < 0)
5225 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005226 return list;
5227
5228 onError:
5229 Py_DECREF(list);
5230 return NULL;
5231}
5232
5233static
5234PyObject *rsplit_substring(PyUnicodeObject *self,
5235 PyObject *list,
5236 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005237 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005238{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005239 register Py_ssize_t i;
5240 register Py_ssize_t j;
5241 Py_ssize_t len = self->length;
5242 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005243 PyObject *str;
5244
5245 for (i = len - sublen, j = len; i >= 0; ) {
5246 if (Py_UNICODE_MATCH(self, i, substring)) {
5247 if (maxcount-- <= 0)
5248 break;
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005249 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005250 j = i;
5251 i -= sublen;
5252 } else
5253 i--;
5254 }
5255 if (j >= 0) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005256 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005257 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005258 if (PyList_Reverse(list) < 0)
5259 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005260 return list;
5261
5262 onError:
5263 Py_DECREF(list);
5264 return NULL;
5265}
5266
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267#undef SPLIT_APPEND
5268
5269static
5270PyObject *split(PyUnicodeObject *self,
5271 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005272 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273{
5274 PyObject *list;
5275
5276 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005277 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278
5279 list = PyList_New(0);
5280 if (!list)
5281 return NULL;
5282
5283 if (substring == NULL)
5284 return split_whitespace(self,list,maxcount);
5285
5286 else if (substring->length == 1)
5287 return split_char(self,list,substring->str[0],maxcount);
5288
5289 else if (substring->length == 0) {
5290 Py_DECREF(list);
5291 PyErr_SetString(PyExc_ValueError, "empty separator");
5292 return NULL;
5293 }
5294 else
5295 return split_substring(self,list,substring,maxcount);
5296}
5297
Tim Petersced69f82003-09-16 20:30:58 +00005298static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005299PyObject *rsplit(PyUnicodeObject *self,
5300 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005301 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005302{
5303 PyObject *list;
5304
5305 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005306 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005307
5308 list = PyList_New(0);
5309 if (!list)
5310 return NULL;
5311
5312 if (substring == NULL)
5313 return rsplit_whitespace(self,list,maxcount);
5314
5315 else if (substring->length == 1)
5316 return rsplit_char(self,list,substring->str[0],maxcount);
5317
5318 else if (substring->length == 0) {
5319 Py_DECREF(list);
5320 PyErr_SetString(PyExc_ValueError, "empty separator");
5321 return NULL;
5322 }
5323 else
5324 return rsplit_substring(self,list,substring,maxcount);
5325}
5326
5327static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328PyObject *replace(PyUnicodeObject *self,
5329 PyUnicodeObject *str1,
5330 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005331 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332{
5333 PyUnicodeObject *u;
5334
5335 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005336 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337
Fredrik Lundh347ee272006-05-24 16:35:18 +00005338 if (str1->length == str2->length) {
5339 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005340 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005341 if (str1->length == 1) {
5342 /* replace characters */
5343 Py_UNICODE u1, u2;
5344 if (!findchar(self->str, self->length, str1->str[0]))
5345 goto nothing;
5346 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5347 if (!u)
5348 return NULL;
5349 Py_UNICODE_COPY(u->str, self->str, self->length);
5350 u1 = str1->str[0];
5351 u2 = str2->str[0];
5352 for (i = 0; i < u->length; i++)
5353 if (u->str[i] == u1) {
5354 if (--maxcount < 0)
5355 break;
5356 u->str[i] = u2;
5357 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005359 i = fastsearch(
5360 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005362 if (i < 0)
5363 goto nothing;
5364 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5365 if (!u)
5366 return NULL;
5367 Py_UNICODE_COPY(u->str, self->str, self->length);
5368 while (i <= self->length - str1->length)
5369 if (Py_UNICODE_MATCH(self, i, str1)) {
5370 if (--maxcount < 0)
5371 break;
5372 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5373 i += str1->length;
5374 } else
5375 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005378
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005379 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005380 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 Py_UNICODE *p;
5382
5383 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005384 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 if (n > maxcount)
5386 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005387 if (n == 0)
5388 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005389 /* new_size = self->length + n * (str2->length - str1->length)); */
5390 delta = (str2->length - str1->length);
5391 if (delta == 0) {
5392 new_size = self->length;
5393 } else {
5394 product = n * (str2->length - str1->length);
5395 if ((product / (str2->length - str1->length)) != n) {
5396 PyErr_SetString(PyExc_OverflowError,
5397 "replace string is too long");
5398 return NULL;
5399 }
5400 new_size = self->length + product;
5401 if (new_size < 0) {
5402 PyErr_SetString(PyExc_OverflowError,
5403 "replace string is too long");
5404 return NULL;
5405 }
5406 }
5407 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005408 if (!u)
5409 return NULL;
5410 i = 0;
5411 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005412 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005413 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005414 while (n-- > 0) {
5415 /* look for next match */
5416 j = i;
5417 while (j <= e) {
5418 if (Py_UNICODE_MATCH(self, j, str1))
5419 break;
5420 j++;
5421 }
5422 if (j > i) {
5423 if (j > e)
5424 break;
5425 /* copy unchanged part [i:j] */
5426 Py_UNICODE_COPY(p, self->str+i, j-i);
5427 p += j - i;
5428 }
5429 /* copy substitution string */
5430 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005431 Py_UNICODE_COPY(p, str2->str, str2->length);
5432 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005433 }
5434 i = j + str1->length;
5435 }
5436 if (i < self->length)
5437 /* copy tail [i:] */
5438 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005439 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005440 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005441 while (n > 0) {
5442 Py_UNICODE_COPY(p, str2->str, str2->length);
5443 p += str2->length;
5444 if (--n <= 0)
5445 break;
5446 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005448 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 }
5450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005452
5453nothing:
5454 /* nothing to replace; return original string (when possible) */
5455 if (PyUnicode_CheckExact(self)) {
5456 Py_INCREF(self);
5457 return (PyObject *) self;
5458 }
5459 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460}
5461
5462/* --- Unicode Object Methods --------------------------------------------- */
5463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005464PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465"S.title() -> unicode\n\
5466\n\
5467Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005468characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469
5470static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005471unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 return fixup(self, fixtitle);
5474}
5475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005476PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477"S.capitalize() -> unicode\n\
5478\n\
5479Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005480have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481
5482static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005483unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 return fixup(self, fixcapitalize);
5486}
5487
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out): split self at whitespace, capitalize each
   word through fixup()/fixcapitalize, and join the words again with
   PyUnicode_Join()'s default separator. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing it in the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the list's reference to the old word before the slot
           is overwritten with the new one */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return result;
}
#endif
5525
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005526/* Argument converter. Coerces to a single unicode character */
5527
5528static int
5529convert_uc(PyObject *obj, void *addr)
5530{
5531 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5532 PyObject *uniobj;
5533 Py_UNICODE *unistr;
5534
5535 uniobj = PyUnicode_FromObject(obj);
5536 if (uniobj == NULL) {
5537 PyErr_SetString(PyExc_TypeError,
5538 "The fill character cannot be converted to Unicode");
5539 return 0;
5540 }
5541 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5542 PyErr_SetString(PyExc_TypeError,
5543 "The fill character must be exactly one character long");
5544 Py_DECREF(uniobj);
5545 return 0;
5546 }
5547 unistr = PyUnicode_AS_UNICODE(uniobj);
5548 *fillcharloc = unistr[0];
5549 Py_DECREF(uniobj);
5550 return 1;
5551}
5552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005553PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005554"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005556Return S centered in a Unicode string of length width. Padding is\n\
5557done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
5559static PyObject *
5560unicode_center(PyUnicodeObject *self, PyObject *args)
5561{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005562 Py_ssize_t marg, left;
5563 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005564 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565
Thomas Woutersde017742006-02-16 19:34:37 +00005566 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 return NULL;
5568
Tim Peters7a29bd52001-09-12 03:03:31 +00005569 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 Py_INCREF(self);
5571 return (PyObject*) self;
5572 }
5573
5574 marg = width - self->length;
5575 left = marg / 2 + (marg & width & 1);
5576
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005577 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578}
5579
Marc-André Lemburge5034372000-08-08 08:04:29 +00005580#if 0
5581
5582/* This code should go into some future Unicode collation support
5583 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00005584 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00005585
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005586/* speedy UTF-16 code point order comparison */
5587/* gleaned from: */
5588/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5589
/* Adjustment table indexed by (code unit >> 11), i.e. one entry per block
   of 2048 code units.  Adding the entry to a code unit moves the block at
   index 27 (0xD800-0xDFFF) up by 0x2000 and the blocks at indices 28-31
   (0xE000-0xFFFF) down by 0x800.  The table is only ever read, so it is
   declared const. */
static const short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
5597
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598static int
5599unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5600{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005601 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005602
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 Py_UNICODE *s1 = str1->str;
5604 Py_UNICODE *s2 = str2->str;
5605
5606 len1 = str1->length;
5607 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005608
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005610 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005611
5612 c1 = *s1++;
5613 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00005614
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005615 if (c1 > (1<<11) * 26)
5616 c1 += utf16Fixup[c1>>11];
5617 if (c2 > (1<<11) * 26)
5618 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005619 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00005620
5621 if (c1 != c2)
5622 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00005623
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00005624 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 }
5626
5627 return (len1 < len2) ? -1 : (len1 != len2);
5628}
5629
Marc-André Lemburge5034372000-08-08 08:04:29 +00005630#else
5631
5632static int
5633unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5634{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005636
5637 Py_UNICODE *s1 = str1->str;
5638 Py_UNICODE *s2 = str2->str;
5639
5640 len1 = str1->length;
5641 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00005642
Marc-André Lemburge5034372000-08-08 08:04:29 +00005643 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00005644 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00005645
Fredrik Lundh45714e92001-06-26 16:39:36 +00005646 c1 = *s1++;
5647 c2 = *s2++;
5648
5649 if (c1 != c2)
5650 return (c1 < c2) ? -1 : 1;
5651
Marc-André Lemburge5034372000-08-08 08:04:29 +00005652 len1--; len2--;
5653 }
5654
5655 return (len1 < len2) ? -1 : (len1 != len2);
5656}
5657
5658#endif
5659
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660int PyUnicode_Compare(PyObject *left,
5661 PyObject *right)
5662{
5663 PyUnicodeObject *u = NULL, *v = NULL;
5664 int result;
5665
5666 /* Coerce the two arguments */
5667 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5668 if (u == NULL)
5669 goto onError;
5670 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5671 if (v == NULL)
5672 goto onError;
5673
Thomas Wouters7e474022000-07-16 12:04:32 +00005674 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 if (v == u) {
5676 Py_DECREF(u);
5677 Py_DECREF(v);
5678 return 0;
5679 }
5680
5681 result = unicode_compare(u, v);
5682
5683 Py_DECREF(u);
5684 Py_DECREF(v);
5685 return result;
5686
5687onError:
5688 Py_XDECREF(u);
5689 Py_XDECREF(v);
5690 return -1;
5691}
5692
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00005693PyObject *PyUnicode_RichCompare(PyObject *left,
5694 PyObject *right,
5695 int op)
5696{
5697 int result;
5698
5699 result = PyUnicode_Compare(left, right);
5700 if (result == -1 && PyErr_Occurred())
5701 goto onError;
5702
5703 /* Convert the return value to a Boolean */
5704 switch (op) {
5705 case Py_EQ:
5706 result = (result == 0);
5707 break;
5708 case Py_NE:
5709 result = (result != 0);
5710 break;
5711 case Py_LE:
5712 result = (result <= 0);
5713 break;
5714 case Py_GE:
5715 result = (result >= 0);
5716 break;
5717 case Py_LT:
5718 result = (result == -1);
5719 break;
5720 case Py_GT:
5721 result = (result == 1);
5722 break;
5723 }
5724 return PyBool_FromLong(result);
5725
5726 onError:
5727
5728 /* Standard case
5729
5730 Type errors mean that PyUnicode_FromObject() could not convert
5731 one of the arguments (usually the right hand side) to Unicode,
5732 ie. we can't handle the comparison request. However, it is
5733 possible that the other object knows a comparison method, which
5734 is why we return Py_NotImplemented to give the other object a
5735 chance.
5736
5737 */
5738 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5739 PyErr_Clear();
5740 Py_INCREF(Py_NotImplemented);
5741 return Py_NotImplemented;
5742 }
5743 if (op != Py_EQ && op != Py_NE)
5744 return NULL;
5745
5746 /* Equality comparison.
5747
5748 This is a special case: we silence any PyExc_UnicodeDecodeError
5749 and instead turn it into a PyErr_UnicodeWarning.
5750
5751 */
5752 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5753 return NULL;
5754 PyErr_Clear();
5755 if (PyErr_Warn(PyExc_UnicodeWarning,
5756 (op == Py_EQ) ?
5757 "Unicode equal comparison "
5758 "failed to convert both arguments to Unicode - "
5759 "interpreting them as being unequal" :
5760 "Unicode unequal comparison "
5761 "failed to convert both arguments to Unicode - "
5762 "interpreting them as being unequal"
5763 ) < 0)
5764 return NULL;
5765 result = (op == Py_NE);
5766 return PyBool_FromLong(result);
5767}
5768
Guido van Rossum403d68b2000-03-13 15:55:09 +00005769int PyUnicode_Contains(PyObject *container,
5770 PyObject *element)
5771{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005772 PyObject *str, *sub;
5773 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005774
5775 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005776 sub = PyUnicode_FromObject(element);
5777 if (!sub) {
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005778 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00005779 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00005780 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00005781 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00005782
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005783 str = PyUnicode_FromObject(container);
5784 if (!str) {
5785 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00005786 return -1;
5787 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005788
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005789 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00005790
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005791 Py_DECREF(str);
5792 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00005793
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005794 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00005795}
5796
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797/* Concat to string or Unicode object giving a new Unicode object. */
5798
5799PyObject *PyUnicode_Concat(PyObject *left,
5800 PyObject *right)
5801{
5802 PyUnicodeObject *u = NULL, *v = NULL, *w;
5803
5804 /* Coerce the two arguments */
5805 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5806 if (u == NULL)
5807 goto onError;
5808 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5809 if (v == NULL)
5810 goto onError;
5811
5812 /* Shortcuts */
5813 if (v == unicode_empty) {
5814 Py_DECREF(v);
5815 return (PyObject *)u;
5816 }
5817 if (u == unicode_empty) {
5818 Py_DECREF(u);
5819 return (PyObject *)v;
5820 }
5821
5822 /* Concat the two Unicode strings */
5823 w = _PyUnicode_New(u->length + v->length);
5824 if (w == NULL)
5825 goto onError;
5826 Py_UNICODE_COPY(w->str, u->str, u->length);
5827 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5828
5829 Py_DECREF(u);
5830 Py_DECREF(v);
5831 return (PyObject *)w;
5832
5833onError:
5834 Py_XDECREF(u);
5835 Py_XDECREF(v);
5836 return NULL;
5837}
5838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005839PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840"S.count(sub[, start[, end]]) -> int\n\
5841\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00005842Return the number of non-overlapping occurrences of substring sub in\n\
5843Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005844interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845
5846static PyObject *
5847unicode_count(PyUnicodeObject *self, PyObject *args)
5848{
5849 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005850 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005851 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 PyObject *result;
5853
Guido van Rossumb8872e62000-05-09 14:14:27 +00005854 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5855 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 return NULL;
5857
5858 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005859 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 if (substring == NULL)
5861 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005862
Fredrik Lundhc8162812006-05-26 19:33:03 +00005863 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005865 result = PyInt_FromSsize_t(
5866 stringlib_count(self->str + start, end - start,
5867 substring->str, substring->length)
5868 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
5870 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005871
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 return result;
5873}
5874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005875PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005876"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005878Encodes S using the codec registered for encoding. encoding defaults\n\
5879to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005880handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5882'xmlcharrefreplace' as well as any other name registered with\n\
5883codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884
5885static PyObject *
5886unicode_encode(PyUnicodeObject *self, PyObject *args)
5887{
5888 char *encoding = NULL;
5889 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005890 PyObject *v;
5891
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5893 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005894 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005895 if (v == NULL)
5896 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005897 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5898 PyErr_Format(PyExc_TypeError,
5899 "encoder did not return a string/unicode object "
5900 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005901 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005902 Py_DECREF(v);
5903 return NULL;
5904 }
5905 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005906
5907 onError:
5908 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005909}
5910
5911PyDoc_STRVAR(decode__doc__,
5912"S.decode([encoding[,errors]]) -> string or unicode\n\
5913\n\
5914Decodes S using the codec registered for encoding. encoding defaults\n\
5915to the default encoding. errors may be given to set a different error\n\
5916handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5917a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5918as well as any other name registerd with codecs.register_error that is\n\
5919able to handle UnicodeDecodeErrors.");
5920
5921static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005922unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005923{
5924 char *encoding = NULL;
5925 char *errors = NULL;
5926 PyObject *v;
5927
5928 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5929 return NULL;
5930 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005931 if (v == NULL)
5932 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005933 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5934 PyErr_Format(PyExc_TypeError,
5935 "decoder did not return a string/unicode object "
5936 "(type=%.400s)",
Martin v. Löwis68192102007-07-21 06:55:02 +00005937 Py_Type(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005938 Py_DECREF(v);
5939 return NULL;
5940 }
5941 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005942
5943 onError:
5944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945}
5946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005947PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948"S.expandtabs([tabsize]) -> unicode\n\
5949\n\
5950Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005951If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
5953static PyObject*
5954unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5955{
5956 Py_UNICODE *e;
5957 Py_UNICODE *p;
5958 Py_UNICODE *q;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005959 Py_ssize_t i, j, old_j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 PyUnicodeObject *u;
5961 int tabsize = 8;
5962
5963 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5964 return NULL;
5965
Thomas Wouters7e474022000-07-16 12:04:32 +00005966 /* First pass: determine size of output string */
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005967 i = j = old_j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 e = self->str + self->length;
5969 for (p = self->str; p < e; p++)
5970 if (*p == '\t') {
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005971 if (tabsize > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 j += tabsize - (j % tabsize);
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005973 if (old_j > j) {
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00005974 PyErr_SetString(PyExc_OverflowError,
5975 "new string is too long");
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005976 return NULL;
5977 }
5978 old_j = j;
5979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 }
5981 else {
5982 j++;
5983 if (*p == '\n' || *p == '\r') {
5984 i += j;
Neal Norwitz5c9a81a2007-06-11 02:16:10 +00005985 old_j = j = 0;
5986 if (i < 0) {
5987 PyErr_SetString(PyExc_OverflowError,
5988 "new string is too long");
5989 return NULL;
5990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 }
5992 }
5993
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00005994 if ((i + j) < 0) {
5995 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5996 return NULL;
5997 }
5998
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 /* Second pass: create output string and fill it */
6000 u = _PyUnicode_New(i + j);
6001 if (!u)
6002 return NULL;
6003
6004 j = 0;
6005 q = u->str;
6006
6007 for (p = self->str; p < e; p++)
6008 if (*p == '\t') {
6009 if (tabsize > 0) {
6010 i = tabsize - (j % tabsize);
6011 j += i;
6012 while (i--)
6013 *q++ = ' ';
6014 }
6015 }
6016 else {
6017 j++;
6018 *q++ = *p;
6019 if (*p == '\n' || *p == '\r')
6020 j = 0;
6021 }
6022
6023 return (PyObject*) u;
6024}
6025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006026PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027"S.find(sub [,start [,end]]) -> int\n\
6028\n\
6029Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006030such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031arguments start and end are interpreted as in slice notation.\n\
6032\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006033Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
6035static PyObject *
6036unicode_find(PyUnicodeObject *self, PyObject *args)
6037{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006038 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006039 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006040 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006041 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
Guido van Rossumb8872e62000-05-09 14:14:27 +00006043 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6044 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006046 substring = PyUnicode_FromObject(substring);
6047 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 return NULL;
6049
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006050 result = stringlib_find_slice(
6051 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6052 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6053 start, end
6054 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055
6056 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006057
6058 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059}
6060
6061static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006062unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063{
6064 if (index < 0 || index >= self->length) {
6065 PyErr_SetString(PyExc_IndexError, "string index out of range");
6066 return NULL;
6067 }
6068
6069 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6070}
6071
6072static long
6073unicode_hash(PyUnicodeObject *self)
6074{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006075 /* Since Unicode objects compare equal to their ASCII string
6076 counterparts, they should use the individual character values
6077 as basis for their hash value. This is needed to assure that
6078 strings and Unicode objects behave in the same way as
6079 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080
Martin v. Löwis18e16552006-02-15 17:27:45 +00006081 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006082 register Py_UNICODE *p;
6083 register long x;
6084
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 if (self->hash != -1)
6086 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006087 len = PyUnicode_GET_SIZE(self);
6088 p = PyUnicode_AS_UNICODE(self);
6089 x = *p << 7;
6090 while (--len >= 0)
6091 x = (1000003*x) ^ *p++;
6092 x ^= PyUnicode_GET_SIZE(self);
6093 if (x == -1)
6094 x = -2;
6095 self->hash = x;
6096 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097}
6098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006099PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100"S.index(sub [,start [,end]]) -> int\n\
6101\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006102Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
6104static PyObject *
6105unicode_index(PyUnicodeObject *self, PyObject *args)
6106{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006107 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006108 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006109 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006110 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111
Guido van Rossumb8872e62000-05-09 14:14:27 +00006112 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6113 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006115 substring = PyUnicode_FromObject(substring);
6116 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 return NULL;
6118
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006119 result = stringlib_find_slice(
6120 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6121 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6122 start, end
6123 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
6125 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006126
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 if (result < 0) {
6128 PyErr_SetString(PyExc_ValueError, "substring not found");
6129 return NULL;
6130 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006131
Martin v. Löwis18e16552006-02-15 17:27:45 +00006132 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133}
6134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006135PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006136"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006138Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006139at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
6141static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006142unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143{
6144 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6145 register const Py_UNICODE *e;
6146 int cased;
6147
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 /* Shortcut for single character strings */
6149 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006150 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006152 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006153 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006154 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006155
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 e = p + PyUnicode_GET_SIZE(self);
6157 cased = 0;
6158 for (; p < e; p++) {
6159 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006160
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006162 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 else if (!cased && Py_UNICODE_ISLOWER(ch))
6164 cased = 1;
6165 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006166 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167}
6168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006169PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006170"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006172Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006173at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
6175static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006176unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177{
6178 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6179 register const Py_UNICODE *e;
6180 int cased;
6181
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 /* Shortcut for single character strings */
6183 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006184 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006186 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006187 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006188 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006189
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 e = p + PyUnicode_GET_SIZE(self);
6191 cased = 0;
6192 for (; p < e; p++) {
6193 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006194
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006196 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 else if (!cased && Py_UNICODE_ISUPPER(ch))
6198 cased = 1;
6199 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006200 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201}
6202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006203PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006204"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006206Return True if S is a titlecased string and there is at least one\n\
6207character in S, i.e. upper- and titlecase characters may only\n\
6208follow uncased characters and lowercase characters only cased ones.\n\
6209Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210
6211static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006212unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213{
6214 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6215 register const Py_UNICODE *e;
6216 int cased, previous_is_cased;
6217
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 /* Shortcut for single character strings */
6219 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006220 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6221 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006223 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006224 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006225 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006226
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 e = p + PyUnicode_GET_SIZE(self);
6228 cased = 0;
6229 previous_is_cased = 0;
6230 for (; p < e; p++) {
6231 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006232
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6234 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006235 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 previous_is_cased = 1;
6237 cased = 1;
6238 }
6239 else if (Py_UNICODE_ISLOWER(ch)) {
6240 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006241 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 previous_is_cased = 1;
6243 cased = 1;
6244 }
6245 else
6246 previous_is_cased = 0;
6247 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006248 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249}
6250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006251PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006252"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006254Return True if all characters in S are whitespace\n\
6255and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256
6257static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006258unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259{
6260 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6261 register const Py_UNICODE *e;
6262
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 /* Shortcut for single character strings */
6264 if (PyUnicode_GET_SIZE(self) == 1 &&
6265 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006266 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006268 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006269 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006270 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006271
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 e = p + PyUnicode_GET_SIZE(self);
6273 for (; p < e; p++) {
6274 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006275 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006277 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278}
6279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006280PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006281"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006282\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006283Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006284and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006285
6286static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006287unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006288{
6289 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6290 register const Py_UNICODE *e;
6291
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006292 /* Shortcut for single character strings */
6293 if (PyUnicode_GET_SIZE(self) == 1 &&
6294 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006295 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006296
6297 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006298 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006299 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006300
6301 e = p + PyUnicode_GET_SIZE(self);
6302 for (; p < e; p++) {
6303 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006304 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006305 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006306 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006307}
6308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006309PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006310"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006311\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006312Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006313and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006314
6315static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006316unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006317{
6318 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6319 register const Py_UNICODE *e;
6320
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006321 /* Shortcut for single character strings */
6322 if (PyUnicode_GET_SIZE(self) == 1 &&
6323 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006324 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006325
6326 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006327 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006328 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006329
6330 e = p + PyUnicode_GET_SIZE(self);
6331 for (; p < e; p++) {
6332 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006333 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006334 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006335 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006336}
6337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006338PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006339"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006341Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006342False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
6344static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006345unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346{
6347 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6348 register const Py_UNICODE *e;
6349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 /* Shortcut for single character strings */
6351 if (PyUnicode_GET_SIZE(self) == 1 &&
6352 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006353 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006355 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006356 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006357 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006358
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 e = p + PyUnicode_GET_SIZE(self);
6360 for (; p < e; p++) {
6361 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006362 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006364 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365}
6366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006367PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006368"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006370Return True if all characters in S are digits\n\
6371and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
6373static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006374unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
6376 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6377 register const Py_UNICODE *e;
6378
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 /* Shortcut for single character strings */
6380 if (PyUnicode_GET_SIZE(self) == 1 &&
6381 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006382 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006384 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006385 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006386 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006387
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 e = p + PyUnicode_GET_SIZE(self);
6389 for (; p < e; p++) {
6390 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006391 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006393 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394}
6395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006396PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006397"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006399Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006400False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401
6402static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006403unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404{
6405 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6406 register const Py_UNICODE *e;
6407
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 /* Shortcut for single character strings */
6409 if (PyUnicode_GET_SIZE(self) == 1 &&
6410 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006411 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006413 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006414 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00006415 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006416
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 e = p + PyUnicode_GET_SIZE(self);
6418 for (; p < e; p++) {
6419 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00006420 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006422 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423}
6424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006425PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426"S.join(sequence) -> unicode\n\
6427\n\
6428Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006429sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430
6431static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006432unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006434 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435}
6436
Martin v. Löwis18e16552006-02-15 17:27:45 +00006437static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438unicode_length(PyUnicodeObject *self)
6439{
6440 return self->length;
6441}
6442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006443PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006444"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445\n\
6446Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006447done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448
6449static PyObject *
6450unicode_ljust(PyUnicodeObject *self, PyObject *args)
6451{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006452 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006453 Py_UNICODE fillchar = ' ';
6454
Martin v. Löwis412fb672006-04-13 06:34:32 +00006455 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 return NULL;
6457
Tim Peters7a29bd52001-09-12 03:03:31 +00006458 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 Py_INCREF(self);
6460 return (PyObject*) self;
6461 }
6462
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006463 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464}
6465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006466PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467"S.lower() -> unicode\n\
6468\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006469Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470
6471static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006472unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 return fixup(self, fixlower);
6475}
6476
/* Selectors telling the strip helpers which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a selector: its format string with the 3-character
   "|O:" prefix skipped */
#define STRIPNAME(i) (stripformat[i]+3)
6485
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006486/* externally visible for str.strip(unicode) */
6487PyObject *
6488_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6489{
6490 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006491 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006492 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006493 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6494 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006495
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006496 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6497
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006498 i = 0;
6499 if (striptype != RIGHTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006500 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6501 i++;
6502 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006503 }
6504
6505 j = len;
6506 if (striptype != LEFTSTRIP) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006507 do {
6508 j--;
6509 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6510 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006511 }
6512
6513 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006514 Py_INCREF(self);
6515 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006516 }
6517 else
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006518 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006519}
6520
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
6522static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006523do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006525 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00006526 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006527
6528 i = 0;
6529 if (striptype != RIGHTSTRIP) {
6530 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6531 i++;
6532 }
6533 }
6534
6535 j = len;
6536 if (striptype != LEFTSTRIP) {
6537 do {
6538 j--;
6539 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6540 j++;
6541 }
6542
6543 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6544 Py_INCREF(self);
6545 return (PyObject*)self;
6546 }
6547 else
6548 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549}
6550
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006551
6552static PyObject *
6553do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6554{
6555 PyObject *sep = NULL;
6556
6557 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6558 return NULL;
6559
6560 if (sep != NULL && sep != Py_None) {
6561 if (PyUnicode_Check(sep))
6562 return _PyUnicode_XStrip(self, striptype, sep);
6563 else if (PyString_Check(sep)) {
6564 PyObject *res;
6565 sep = PyUnicode_FromObject(sep);
6566 if (sep==NULL)
6567 return NULL;
6568 res = _PyUnicode_XStrip(self, striptype, sep);
6569 Py_DECREF(sep);
6570 return res;
6571 }
6572 else {
6573 PyErr_Format(PyExc_TypeError,
6574 "%s arg must be None, unicode or str",
6575 STRIPNAME(striptype));
6576 return NULL;
6577 }
6578 }
6579
6580 return do_strip(self, striptype);
6581}
6582
6583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006584PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006585"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006586\n\
6587Return a copy of the string S with leading and trailing\n\
6588whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006589If chars is given and not None, remove characters in chars instead.\n\
6590If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006591
6592static PyObject *
6593unicode_strip(PyUnicodeObject *self, PyObject *args)
6594{
6595 if (PyTuple_GET_SIZE(args) == 0)
6596 return do_strip(self, BOTHSTRIP); /* Common case */
6597 else
6598 return do_argstrip(self, BOTHSTRIP, args);
6599}
6600
6601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006602PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006603"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006604\n\
6605Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006606If chars is given and not None, remove characters in chars instead.\n\
6607If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006608
6609static PyObject *
6610unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6611{
6612 if (PyTuple_GET_SIZE(args) == 0)
6613 return do_strip(self, LEFTSTRIP); /* Common case */
6614 else
6615 return do_argstrip(self, LEFTSTRIP, args);
6616}
6617
6618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006619PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00006620"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006621\n\
6622Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00006623If chars is given and not None, remove characters in chars instead.\n\
6624If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006625
6626static PyObject *
6627unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6628{
6629 if (PyTuple_GET_SIZE(args) == 0)
6630 return do_strip(self, RIGHTSTRIP); /* Common case */
6631 else
6632 return do_argstrip(self, RIGHTSTRIP, args);
6633}
6634
6635
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006637unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638{
6639 PyUnicodeObject *u;
6640 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006641 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00006642 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643
6644 if (len < 0)
6645 len = 0;
6646
Tim Peters7a29bd52001-09-12 03:03:31 +00006647 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 /* no repeat, return original string */
6649 Py_INCREF(str);
6650 return (PyObject*) str;
6651 }
Tim Peters8f422462000-09-09 06:13:41 +00006652
6653 /* ensure # of chars needed doesn't overflow int and # of bytes
6654 * needed doesn't overflow size_t
6655 */
6656 nchars = len * str->length;
6657 if (len && nchars / len != str->length) {
6658 PyErr_SetString(PyExc_OverflowError,
6659 "repeated string is too long");
6660 return NULL;
6661 }
6662 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6663 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6664 PyErr_SetString(PyExc_OverflowError,
6665 "repeated string is too long");
6666 return NULL;
6667 }
6668 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 if (!u)
6670 return NULL;
6671
6672 p = u->str;
6673
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006674 if (str->length == 1 && len > 0) {
6675 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006676 } else {
Tim Peters1bacc642006-05-23 05:47:16 +00006677 Py_ssize_t done = 0; /* number of characters copied this far */
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006678 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00006679 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00006680 done = str->length;
6681 }
6682 while (done < nchars) {
6683 int n = (done <= nchars-done) ? done : nchars-done;
6684 Py_UNICODE_COPY(p+done, p, n);
6685 done += n;
6686 }
6687 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689 return (PyObject*) u;
6690}
6691
6692PyObject *PyUnicode_Replace(PyObject *obj,
6693 PyObject *subobj,
6694 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006695 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696{
6697 PyObject *self;
6698 PyObject *str1;
6699 PyObject *str2;
6700 PyObject *result;
6701
6702 self = PyUnicode_FromObject(obj);
6703 if (self == NULL)
6704 return NULL;
6705 str1 = PyUnicode_FromObject(subobj);
6706 if (str1 == NULL) {
6707 Py_DECREF(self);
6708 return NULL;
6709 }
6710 str2 = PyUnicode_FromObject(replobj);
6711 if (str2 == NULL) {
6712 Py_DECREF(self);
6713 Py_DECREF(str1);
6714 return NULL;
6715 }
Tim Petersced69f82003-09-16 20:30:58 +00006716 result = replace((PyUnicodeObject *)self,
6717 (PyUnicodeObject *)str1,
6718 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 maxcount);
6720 Py_DECREF(self);
6721 Py_DECREF(str1);
6722 Py_DECREF(str2);
6723 return result;
6724}
6725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006726PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727"S.replace (old, new[, maxsplit]) -> unicode\n\
6728\n\
6729Return a copy of S with all occurrences of substring\n\
6730old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006731given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732
6733static PyObject*
6734unicode_replace(PyUnicodeObject *self, PyObject *args)
6735{
6736 PyUnicodeObject *str1;
6737 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006738 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 PyObject *result;
6740
Martin v. Löwis18e16552006-02-15 17:27:45 +00006741 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 return NULL;
6743 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6744 if (str1 == NULL)
6745 return NULL;
6746 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006747 if (str2 == NULL) {
6748 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00006750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751
6752 result = replace(self, str1, str2, maxcount);
6753
6754 Py_DECREF(str1);
6755 Py_DECREF(str2);
6756 return result;
6757}
6758
6759static
6760PyObject *unicode_repr(PyObject *unicode)
6761{
6762 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6763 PyUnicode_GET_SIZE(unicode),
6764 1);
6765}
6766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006767PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768"S.rfind(sub [,start [,end]]) -> int\n\
6769\n\
6770Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006771such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772arguments start and end are interpreted as in slice notation.\n\
6773\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006774Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775
6776static PyObject *
6777unicode_rfind(PyUnicodeObject *self, PyObject *args)
6778{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006779 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006780 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006781 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006782 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783
Guido van Rossumb8872e62000-05-09 14:14:27 +00006784 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6785 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006787 substring = PyUnicode_FromObject(substring);
6788 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 return NULL;
6790
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006791 result = stringlib_rfind_slice(
6792 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6793 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6794 start, end
6795 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796
6797 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006798
6799 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800}
6801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006802PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803"S.rindex(sub [,start [,end]]) -> int\n\
6804\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006805Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806
6807static PyObject *
6808unicode_rindex(PyUnicodeObject *self, PyObject *args)
6809{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006810 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006811 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006812 Py_ssize_t end = PY_SSIZE_T_MAX;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006813 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
Guido van Rossumb8872e62000-05-09 14:14:27 +00006815 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6816 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 return NULL;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006818 substring = PyUnicode_FromObject(substring);
6819 if (!substring)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 return NULL;
6821
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006822 result = stringlib_rfind_slice(
6823 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6824 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6825 start, end
6826 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827
6828 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006829
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 if (result < 0) {
6831 PyErr_SetString(PyExc_ValueError, "substring not found");
6832 return NULL;
6833 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006834 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835}
6836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006837PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006838"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839\n\
6840Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006841done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842
6843static PyObject *
6844unicode_rjust(PyUnicodeObject *self, PyObject *args)
6845{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006846 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006847 Py_UNICODE fillchar = ' ';
6848
Martin v. Löwis412fb672006-04-13 06:34:32 +00006849 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 return NULL;
6851
Tim Peters7a29bd52001-09-12 03:03:31 +00006852 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 Py_INCREF(self);
6854 return (PyObject*) self;
6855 }
6856
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006857 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858}
6859
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00006861unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862{
6863 /* standard clamping */
6864 if (start < 0)
6865 start = 0;
6866 if (end < 0)
6867 end = 0;
6868 if (end > self->length)
6869 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006870 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 /* full slice, return original string */
6872 Py_INCREF(self);
6873 return (PyObject*) self;
6874 }
6875 if (start > end)
6876 start = end;
6877 /* copy slice */
6878 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6879 end - start);
6880}
6881
6882PyObject *PyUnicode_Split(PyObject *s,
6883 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006884 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885{
6886 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006887
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 s = PyUnicode_FromObject(s);
6889 if (s == NULL)
6890 return NULL;
6891 if (sep != NULL) {
6892 sep = PyUnicode_FromObject(sep);
6893 if (sep == NULL) {
6894 Py_DECREF(s);
6895 return NULL;
6896 }
6897 }
6898
6899 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6900
6901 Py_DECREF(s);
6902 Py_XDECREF(sep);
6903 return result;
6904}
6905
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006906PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907"S.split([sep [,maxsplit]]) -> list of strings\n\
6908\n\
6909Return a list of the words in S, using sep as the\n\
6910delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006911splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006912any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913
6914static PyObject*
6915unicode_split(PyUnicodeObject *self, PyObject *args)
6916{
6917 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006918 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919
Martin v. Löwis18e16552006-02-15 17:27:45 +00006920 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 return NULL;
6922
6923 if (substring == Py_None)
6924 return split(self, NULL, maxcount);
6925 else if (PyUnicode_Check(substring))
6926 return split(self, (PyUnicodeObject *)substring, maxcount);
6927 else
6928 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6929}
6930
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006931PyObject *
6932PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6933{
6934 PyObject* str_obj;
6935 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006936 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00006937
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006938 str_obj = PyUnicode_FromObject(str_in);
6939 if (!str_obj)
6940 return NULL;
6941 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00006942 if (!sep_obj) {
6943 Py_DECREF(str_obj);
6944 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006945 }
6946
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006947 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00006948 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6949 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6950 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006951
Fredrik Lundhb9479482006-05-26 17:22:38 +00006952 Py_DECREF(sep_obj);
6953 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006954
6955 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006956}
6957
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006958
6959PyObject *
6960PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6961{
6962 PyObject* str_obj;
6963 PyObject* sep_obj;
6964 PyObject* out;
6965
6966 str_obj = PyUnicode_FromObject(str_in);
6967 if (!str_obj)
6968 return NULL;
6969 sep_obj = PyUnicode_FromObject(sep_in);
6970 if (!sep_obj) {
6971 Py_DECREF(str_obj);
6972 return NULL;
6973 }
6974
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006975 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006976 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6977 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6978 );
6979
6980 Py_DECREF(sep_obj);
6981 Py_DECREF(str_obj);
6982
6983 return out;
6984}
6985
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006986PyDoc_STRVAR(partition__doc__,
6987"S.partition(sep) -> (head, sep, tail)\n\
6988\n\
6989Searches for the separator sep in S, and returns the part before it,\n\
6990the separator itself, and the part after it. If the separator is not\n\
6991found, returns S and two empty strings.");
6992
6993static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00006994unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006995{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00006996 return PyUnicode_Partition((PyObject *)self, separator);
6997}
6998
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00006999PyDoc_STRVAR(rpartition__doc__,
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007000"S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007001\n\
7002Searches for the separator sep in S, starting at the end of S, and returns\n\
7003the part before it, the separator itself, and the part after it. If the\n\
Raymond Hettingera0c95fa2006-09-04 15:32:48 +00007004separator is not found, returns two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007005
7006static PyObject*
7007unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7008{
7009 return PyUnicode_RPartition((PyObject *)self, separator);
7010}
7011
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007012PyObject *PyUnicode_RSplit(PyObject *s,
7013 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007014 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007015{
7016 PyObject *result;
7017
7018 s = PyUnicode_FromObject(s);
7019 if (s == NULL)
7020 return NULL;
7021 if (sep != NULL) {
7022 sep = PyUnicode_FromObject(sep);
7023 if (sep == NULL) {
7024 Py_DECREF(s);
7025 return NULL;
7026 }
7027 }
7028
7029 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7030
7031 Py_DECREF(s);
7032 Py_XDECREF(sep);
7033 return result;
7034}
7035
7036PyDoc_STRVAR(rsplit__doc__,
7037"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7038\n\
7039Return a list of the words in S, using sep as the\n\
7040delimiter string, starting at the end of the string and\n\
7041working to the front. If maxsplit is given, at most maxsplit\n\
7042splits are done. If sep is not specified, any whitespace string\n\
7043is a separator.");
7044
7045static PyObject*
7046unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7047{
7048 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007049 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007050
Martin v. Löwis18e16552006-02-15 17:27:45 +00007051 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007052 return NULL;
7053
7054 if (substring == Py_None)
7055 return rsplit(self, NULL, maxcount);
7056 else if (PyUnicode_Check(substring))
7057 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7058 else
7059 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7060}
7061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007062PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00007063"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064\n\
7065Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007066Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007067is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068
7069static PyObject*
7070unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7071{
Guido van Rossum86662912000-04-11 15:38:46 +00007072 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073
Guido van Rossum86662912000-04-11 15:38:46 +00007074 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 return NULL;
7076
Guido van Rossum86662912000-04-11 15:38:46 +00007077 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078}
7079
7080static
7081PyObject *unicode_str(PyUnicodeObject *self)
7082{
Fred Drakee4315f52000-05-09 19:53:39 +00007083 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084}
7085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007086PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087"S.swapcase() -> unicode\n\
7088\n\
7089Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007090and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091
7092static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007093unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 return fixup(self, fixswapcase);
7096}
7097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007098PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099"S.translate(table) -> unicode\n\
7100\n\
7101Return a copy of the string S, where all characters have been mapped\n\
7102through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007103Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7104Unmapped characters are left untouched. Characters mapped to None\n\
7105are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106
7107static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007108unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109{
Tim Petersced69f82003-09-16 20:30:58 +00007110 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00007112 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 "ignore");
7114}
7115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007116PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117"S.upper() -> unicode\n\
7118\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007119Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
7121static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007122unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 return fixup(self, fixupper);
7125}
7126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007127PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128"S.zfill(width) -> unicode\n\
7129\n\
7130Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007131of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
7133static PyObject *
7134unicode_zfill(PyUnicodeObject *self, PyObject *args)
7135{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007136 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 PyUnicodeObject *u;
7138
Martin v. Löwis18e16552006-02-15 17:27:45 +00007139 Py_ssize_t width;
7140 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 return NULL;
7142
7143 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007144 if (PyUnicode_CheckExact(self)) {
7145 Py_INCREF(self);
7146 return (PyObject*) self;
7147 }
7148 else
7149 return PyUnicode_FromUnicode(
7150 PyUnicode_AS_UNICODE(self),
7151 PyUnicode_GET_SIZE(self)
7152 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 }
7154
7155 fill = width - self->length;
7156
7157 u = pad(self, fill, 0, '0');
7158
Walter Dörwald068325e2002-04-15 13:36:47 +00007159 if (u == NULL)
7160 return NULL;
7161
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 if (u->str[fill] == '+' || u->str[fill] == '-') {
7163 /* move sign to beginning of string */
7164 u->str[0] = u->str[fill];
7165 u->str[fill] = '0';
7166 }
7167
7168 return (PyObject*) u;
7169}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170
#if 0
/* Debugging helper, currently compiled out by the surrounding #if 0:
   returns unicode_freelist_size as a Python int.  Its method-table entry
   in unicode_methods is disabled the same way. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007179PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007180"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007182Return True if S starts with the specified prefix, False otherwise.\n\
7183With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007184With optional end, stop comparing S at that position.\n\
7185prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186
7187static PyObject *
7188unicode_startswith(PyUnicodeObject *self,
7189 PyObject *args)
7190{
Georg Brandl24250812006-06-09 18:45:48 +00007191 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007193 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007194 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007195 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
Georg Brandl24250812006-06-09 18:45:48 +00007197 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00007198 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007200 if (PyTuple_Check(subobj)) {
7201 Py_ssize_t i;
7202 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7203 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7204 PyTuple_GET_ITEM(subobj, i));
7205 if (substring == NULL)
7206 return NULL;
7207 result = tailmatch(self, substring, start, end, -1);
7208 Py_DECREF(substring);
7209 if (result) {
7210 Py_RETURN_TRUE;
7211 }
7212 }
7213 /* nothing matched */
7214 Py_RETURN_FALSE;
7215 }
7216 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007218 return NULL;
7219 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007221 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222}
7223
7224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007225PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007226"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007228Return True if S ends with the specified suffix, False otherwise.\n\
7229With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007230With optional end, stop comparing S at that position.\n\
7231suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232
7233static PyObject *
7234unicode_endswith(PyUnicodeObject *self,
7235 PyObject *args)
7236{
Georg Brandl24250812006-06-09 18:45:48 +00007237 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007239 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007240 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007241 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242
Georg Brandl24250812006-06-09 18:45:48 +00007243 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7244 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007246 if (PyTuple_Check(subobj)) {
7247 Py_ssize_t i;
7248 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7249 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7250 PyTuple_GET_ITEM(subobj, i));
7251 if (substring == NULL)
7252 return NULL;
7253 result = tailmatch(self, substring, start, end, +1);
7254 Py_DECREF(substring);
7255 if (result) {
7256 Py_RETURN_TRUE;
7257 }
7258 }
7259 Py_RETURN_FALSE;
7260 }
7261 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 if (substring == NULL)
Georg Brandl24250812006-06-09 18:45:48 +00007263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264
Georg Brandl24250812006-06-09 18:45:48 +00007265 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007267 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268}
7269
7270
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007271
7272static PyObject *
7273unicode_getnewargs(PyUnicodeObject *v)
7274{
7275 return Py_BuildValue("(u#)", v->str, v->length);
7276}
7277
7278
/* Method table of the unicode type.  Each entry gives the Python-level
   name, the C implementation, its calling convention (METH_VARARGS,
   METH_O or METH_NOARGS) and, where one exists, its docstring.  The
   table ends with an all-NULL sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    /* Disabled entry; compiled out. */
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* No docstring is supplied for this entry. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}  /* sentinel: marks the end of the table */
};
7336
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007337static PyObject *
7338unicode_mod(PyObject *v, PyObject *w)
7339{
7340 if (!PyUnicode_Check(v)) {
7341 Py_INCREF(Py_NotImplemented);
7342 return Py_NotImplemented;
7343 }
7344 return PyUnicode_Format(v, w);
7345}
7346
/* Number-protocol slots of the unicode type.  Only nb_remainder is set
   (to unicode_mod); the slots listed before it are explicitly 0, and
   the members after it are not listed in this initializer. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,    /*nb_remainder*/
};
7354
/* Sequence-protocol slots of the unicode type.  The two assignment
   slots are 0: no item or slice assignment handler is installed. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
7365
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007366static PyObject*
7367unicode_subscript(PyUnicodeObject* self, PyObject* item)
7368{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007369 if (PyIndex_Check(item)) {
7370 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007371 if (i == -1 && PyErr_Occurred())
7372 return NULL;
7373 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007374 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007375 return unicode_getitem(self, i);
7376 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007377 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007378 Py_UNICODE* source_buf;
7379 Py_UNICODE* result_buf;
7380 PyObject* result;
7381
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007382 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007383 &start, &stop, &step, &slicelength) < 0) {
7384 return NULL;
7385 }
7386
7387 if (slicelength <= 0) {
7388 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007389 } else if (start == 0 && step == 1 && slicelength == self->length &&
7390 PyUnicode_CheckExact(self)) {
7391 Py_INCREF(self);
7392 return (PyObject *)self;
7393 } else if (step == 1) {
7394 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007395 } else {
7396 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Anthony Baxtera6286212006-04-11 07:42:36 +00007397 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7398 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007399
7400 if (result_buf == NULL)
7401 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007402
7403 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7404 result_buf[i] = source_buf[cur];
7405 }
Tim Petersced69f82003-09-16 20:30:58 +00007406
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007407 result = PyUnicode_FromUnicode(result_buf, slicelength);
7408 PyMem_FREE(result_buf);
7409 return result;
7410 }
7411 } else {
7412 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7413 return NULL;
7414 }
7415}
7416
/* Mapping-protocol slots of the unicode type: length and subscripting
   are provided; the subscript-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,               /* mp_ass_subscript */
};
7422
Martin v. Löwis18e16552006-02-15 17:27:45 +00007423static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424unicode_buffer_getreadbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007425 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 const void **ptr)
7427{
7428 if (index != 0) {
7429 PyErr_SetString(PyExc_SystemError,
7430 "accessing non-existent unicode segment");
7431 return -1;
7432 }
7433 *ptr = (void *) self->str;
7434 return PyUnicode_GET_DATA_SIZE(self);
7435}
7436
Martin v. Löwis18e16552006-02-15 17:27:45 +00007437static Py_ssize_t
7438unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 const void **ptr)
7440{
7441 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00007442 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 return -1;
7444}
7445
7446static int
7447unicode_buffer_getsegcount(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007448 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
7450 if (lenp)
7451 *lenp = PyUnicode_GET_DATA_SIZE(self);
7452 return 1;
7453}
7454
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007455static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456unicode_buffer_getcharbuf(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007457 Py_ssize_t index,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 const void **ptr)
7459{
7460 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007461
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 if (index != 0) {
7463 PyErr_SetString(PyExc_SystemError,
7464 "accessing non-existent unicode segment");
7465 return -1;
7466 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007467 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 if (str == NULL)
7469 return -1;
7470 *ptr = (void *) PyString_AS_STRING(str);
7471 return PyString_GET_SIZE(str);
7472}
7473
7474/* Helpers for PyUnicode_Format() */
7475
7476static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007477getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007479 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 if (argidx < arglen) {
7481 (*p_argidx)++;
7482 if (arglen < 0)
7483 return args;
7484 else
7485 return PyTuple_GetItem(args, argidx);
7486 }
7487 PyErr_SetString(PyExc_TypeError,
7488 "not enough arguments for format string");
7489 return NULL;
7490}
7491
/* Bit flags for the `flags` argument of the format helpers below; each
   occupies its own bit so they can be OR'ed together.  formatfloat() and
   formatint() test F_ALT to decide whether '#' goes into the C format.
   NOTE(review): the other names presumably mirror the '-', '+', ' ' and
   '0' conversion flags -- confirm against PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
7497
Martin v. Löwis18e16552006-02-15 17:27:45 +00007498static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00007499strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007501 register Py_ssize_t i;
7502 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 for (i = len - 1; i >= 0; i--)
7504 buffer[i] = (Py_UNICODE) charbuffer[i];
7505
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 return len;
7507}
7508
Neal Norwitzfc76d632006-01-10 06:03:13 +00007509static int
7510doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7511{
Tim Peters15231542006-02-16 01:08:01 +00007512 Py_ssize_t result;
7513
Neal Norwitzfc76d632006-01-10 06:03:13 +00007514 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007515 result = strtounicode(buffer, (char *)buffer);
7516 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007517}
7518
7519static int
7520longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7521{
Tim Peters15231542006-02-16 01:08:01 +00007522 Py_ssize_t result;
7523
Neal Norwitzfc76d632006-01-10 06:03:13 +00007524 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00007525 result = strtounicode(buffer, (char *)buffer);
7526 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007527}
7528
Guido van Rossum078151d2002-08-11 04:24:12 +00007529/* XXX To save some code duplication, formatfloat/long/int could have been
7530 shared with stringobject.c, converting from 8-bit to Unicode after the
7531 formatting is done. */
7532
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533static int
7534formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007535 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 int flags,
7537 int prec,
7538 int type,
7539 PyObject *v)
7540{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007541 /* fmt = '%#.' + `prec` + `type`
7542 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 char fmt[20];
7544 double x;
Tim Petersced69f82003-09-16 20:30:58 +00007545
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 x = PyFloat_AsDouble(v);
7547 if (x == -1.0 && PyErr_Occurred())
7548 return -1;
7549 if (prec < 0)
7550 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7552 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007553 /* Worst case length calc to ensure no buffer overrun:
7554
7555 'g' formats:
7556 fmt = %#.<prec>g
7557 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7558 for any double rep.)
7559 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7560
7561 'f' formats:
7562 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7563 len = 1 + 50 + 1 + prec = 52 + prec
7564
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007565 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00007566 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007567
7568 */
Georg Brandl7c3b50d2007-07-12 08:38:00 +00007569 if (((type == 'g' || type == 'G') &&
7570 buflen <= (size_t)10 + (size_t)prec) ||
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007571 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007572 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007573 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007574 return -1;
7575 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00007576 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7577 (flags&F_ALT) ? "#" : "",
7578 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00007579 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580}
7581
Tim Peters38fd5b62000-09-21 05:43:11 +00007582static PyObject*
7583formatlong(PyObject *val, int flags, int prec, int type)
7584{
7585 char *buf;
7586 int i, len;
7587 PyObject *str; /* temporary string object. */
7588 PyUnicodeObject *result;
7589
7590 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7591 if (!str)
7592 return NULL;
7593 result = _PyUnicode_New(len);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007594 if (!result) {
7595 Py_DECREF(str);
7596 return NULL;
7597 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007598 for (i = 0; i < len; i++)
7599 result->str[i] = buf[i];
7600 result->str[len] = 0;
7601 Py_DECREF(str);
7602 return (PyObject*)result;
7603}
7604
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605static int
7606formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007607 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 int flags,
7609 int prec,
7610 int type,
7611 PyObject *v)
7612{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007613 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007614 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7615 * + 1 + 1
7616 * = 24
7617 */
Tim Peters38fd5b62000-09-21 05:43:11 +00007618 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007619 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 long x;
7621
7622 x = PyInt_AsLong(v);
7623 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007624 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007625 if (x < 0 && type == 'u') {
7626 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00007627 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007628 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7629 sign = "-";
7630 else
7631 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007633 prec = 1;
7634
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007635 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7636 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007637 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007638 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007639 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007640 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007641 return -1;
7642 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007643
7644 if ((flags & F_ALT) &&
7645 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00007646 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007647 * of issues that cause pain:
7648 * - when 0 is being converted, the C standard leaves off
7649 * the '0x' or '0X', which is inconsistent with other
7650 * %#x/%#X conversions and inconsistent with Python's
7651 * hex() function
7652 * - there are platforms that violate the standard and
7653 * convert 0 with the '0x' or '0X'
7654 * (Metrowerks, Compaq Tru64)
7655 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00007656 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007657 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00007658 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007659 * We can achieve the desired consistency by inserting our
7660 * own '0x' or '0X' prefix, and substituting %x/%X in place
7661 * of %#x/%#X.
7662 *
7663 * Note that this is the same approach as used in
7664 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007665 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007666 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7667 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00007668 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007669 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007670 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7671 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00007672 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00007673 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007674 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00007675 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007676 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00007677 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678}
7679
7680static int
7681formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007682 size_t buflen,
7683 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007685 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007686 if (PyUnicode_Check(v)) {
7687 if (PyUnicode_GET_SIZE(v) != 1)
7688 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007692 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00007693 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007694 goto onError;
7695 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697
7698 else {
7699 /* Integer input truncated to a character */
7700 long x;
7701 x = PyInt_AsLong(v);
7702 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007703 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007704#ifdef Py_UNICODE_WIDE
7705 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007706 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007707 "%c arg not in range(0x110000) "
7708 "(wide Python build)");
7709 return -1;
7710 }
7711#else
7712 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00007713 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00007714 "%c arg not in range(0x10000) "
7715 "(narrow Python build)");
7716 return -1;
7717 }
7718#endif
7719 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 }
7721 buf[1] = '\0';
7722 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007723
7724 onError:
7725 PyErr_SetString(PyExc_TypeError,
7726 "%c requires int or char");
7727 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728}
7729
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE characters, of the buffer in
   which the floats, ints, & chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so that it stays a single operand
   wherever it is used (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)120)
7739
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740PyObject *PyUnicode_Format(PyObject *format,
7741 PyObject *args)
7742{
7743 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007744 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 int args_owned = 0;
7746 PyUnicodeObject *result = NULL;
7747 PyObject *dict = NULL;
7748 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00007749
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 if (format == NULL || args == NULL) {
7751 PyErr_BadInternalCall();
7752 return NULL;
7753 }
7754 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00007755 if (uformat == NULL)
7756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 fmt = PyUnicode_AS_UNICODE(uformat);
7758 fmtcnt = PyUnicode_GET_SIZE(uformat);
7759
7760 reslen = rescnt = fmtcnt + 100;
7761 result = _PyUnicode_New(reslen);
7762 if (result == NULL)
7763 goto onError;
7764 res = PyUnicode_AS_UNICODE(result);
7765
7766 if (PyTuple_Check(args)) {
7767 arglen = PyTuple_Size(args);
7768 argidx = 0;
7769 }
7770 else {
7771 arglen = -1;
7772 argidx = -2;
7773 }
Martin v. Löwis68192102007-07-21 06:55:02 +00007774 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00007775 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776 dict = args;
7777
7778 while (--fmtcnt >= 0) {
7779 if (*fmt != '%') {
7780 if (--rescnt < 0) {
7781 rescnt = fmtcnt + 100;
7782 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007783 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00007784 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7786 --rescnt;
7787 }
7788 *res++ = *fmt++;
7789 }
7790 else {
7791 /* Got a format specifier */
7792 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007793 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 Py_UNICODE c = '\0';
7796 Py_UNICODE fill;
7797 PyObject *v = NULL;
7798 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007799 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007802 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803
7804 fmt++;
7805 if (*fmt == '(') {
7806 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007807 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 PyObject *key;
7809 int pcount = 1;
7810
7811 if (dict == NULL) {
7812 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00007813 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814 goto onError;
7815 }
7816 ++fmt;
7817 --fmtcnt;
7818 keystart = fmt;
7819 /* Skip over balanced parentheses */
7820 while (pcount > 0 && --fmtcnt >= 0) {
7821 if (*fmt == ')')
7822 --pcount;
7823 else if (*fmt == '(')
7824 ++pcount;
7825 fmt++;
7826 }
7827 keylen = fmt - keystart - 1;
7828 if (fmtcnt < 0 || pcount > 0) {
7829 PyErr_SetString(PyExc_ValueError,
7830 "incomplete format key");
7831 goto onError;
7832 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007833#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00007834 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 then looked up since Python uses strings to hold
7836 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00007837 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 key = PyUnicode_EncodeUTF8(keystart,
7839 keylen,
7840 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00007841#else
7842 key = PyUnicode_FromUnicode(keystart, keylen);
7843#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 if (key == NULL)
7845 goto onError;
7846 if (args_owned) {
7847 Py_DECREF(args);
7848 args_owned = 0;
7849 }
7850 args = PyObject_GetItem(dict, key);
7851 Py_DECREF(key);
7852 if (args == NULL) {
7853 goto onError;
7854 }
7855 args_owned = 1;
7856 arglen = -1;
7857 argidx = -2;
7858 }
7859 while (--fmtcnt >= 0) {
7860 switch (c = *fmt++) {
7861 case '-': flags |= F_LJUST; continue;
7862 case '+': flags |= F_SIGN; continue;
7863 case ' ': flags |= F_BLANK; continue;
7864 case '#': flags |= F_ALT; continue;
7865 case '0': flags |= F_ZERO; continue;
7866 }
7867 break;
7868 }
7869 if (c == '*') {
7870 v = getnextarg(args, arglen, &argidx);
7871 if (v == NULL)
7872 goto onError;
7873 if (!PyInt_Check(v)) {
7874 PyErr_SetString(PyExc_TypeError,
7875 "* wants int");
7876 goto onError;
7877 }
7878 width = PyInt_AsLong(v);
7879 if (width < 0) {
7880 flags |= F_LJUST;
7881 width = -width;
7882 }
7883 if (--fmtcnt >= 0)
7884 c = *fmt++;
7885 }
7886 else if (c >= '0' && c <= '9') {
7887 width = c - '0';
7888 while (--fmtcnt >= 0) {
7889 c = *fmt++;
7890 if (c < '0' || c > '9')
7891 break;
7892 if ((width*10) / 10 != width) {
7893 PyErr_SetString(PyExc_ValueError,
7894 "width too big");
7895 goto onError;
7896 }
7897 width = width*10 + (c - '0');
7898 }
7899 }
7900 if (c == '.') {
7901 prec = 0;
7902 if (--fmtcnt >= 0)
7903 c = *fmt++;
7904 if (c == '*') {
7905 v = getnextarg(args, arglen, &argidx);
7906 if (v == NULL)
7907 goto onError;
7908 if (!PyInt_Check(v)) {
7909 PyErr_SetString(PyExc_TypeError,
7910 "* wants int");
7911 goto onError;
7912 }
7913 prec = PyInt_AsLong(v);
7914 if (prec < 0)
7915 prec = 0;
7916 if (--fmtcnt >= 0)
7917 c = *fmt++;
7918 }
7919 else if (c >= '0' && c <= '9') {
7920 prec = c - '0';
7921 while (--fmtcnt >= 0) {
7922 c = Py_CHARMASK(*fmt++);
7923 if (c < '0' || c > '9')
7924 break;
7925 if ((prec*10) / 10 != prec) {
7926 PyErr_SetString(PyExc_ValueError,
7927 "prec too big");
7928 goto onError;
7929 }
7930 prec = prec*10 + (c - '0');
7931 }
7932 }
7933 } /* prec */
7934 if (fmtcnt >= 0) {
7935 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 if (--fmtcnt >= 0)
7937 c = *fmt++;
7938 }
7939 }
7940 if (fmtcnt < 0) {
7941 PyErr_SetString(PyExc_ValueError,
7942 "incomplete format");
7943 goto onError;
7944 }
7945 if (c != '%') {
7946 v = getnextarg(args, arglen, &argidx);
7947 if (v == NULL)
7948 goto onError;
7949 }
7950 sign = 0;
7951 fill = ' ';
7952 switch (c) {
7953
7954 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007955 pbuf = formatbuf;
7956 /* presume that buffer length is at least 1 */
7957 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 len = 1;
7959 break;
7960
7961 case 's':
7962 case 'r':
7963 if (PyUnicode_Check(v) && c == 's') {
7964 temp = v;
7965 Py_INCREF(temp);
7966 }
7967 else {
7968 PyObject *unicode;
7969 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007970 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 else
7972 temp = PyObject_Repr(v);
7973 if (temp == NULL)
7974 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007975 if (PyUnicode_Check(temp))
7976 /* nothing to do */;
7977 else if (PyString_Check(temp)) {
7978 /* convert to string to Unicode */
Thomas Woutersa96affe2006-03-12 00:29:36 +00007979 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 PyString_GET_SIZE(temp),
Thomas Woutersa96affe2006-03-12 00:29:36 +00007981 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 "strict");
Thomas Woutersa96affe2006-03-12 00:29:36 +00007983 Py_DECREF(temp);
7984 temp = unicode;
7985 if (temp == NULL)
7986 goto onError;
7987 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007988 else {
7989 Py_DECREF(temp);
7990 PyErr_SetString(PyExc_TypeError,
7991 "%s argument has non-string str()");
7992 goto onError;
7993 }
7994 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007995 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 len = PyUnicode_GET_SIZE(temp);
7997 if (prec >= 0 && len > prec)
7998 len = prec;
7999 break;
8000
8001 case 'i':
8002 case 'd':
8003 case 'u':
8004 case 'o':
8005 case 'x':
8006 case 'X':
8007 if (c == 'i')
8008 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00008009 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008010 temp = formatlong(v, flags, prec, c);
8011 if (!temp)
8012 goto onError;
8013 pbuf = PyUnicode_AS_UNICODE(temp);
8014 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00008015 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008017 else {
8018 pbuf = formatbuf;
8019 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8020 flags, prec, c, v);
8021 if (len < 0)
8022 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008023 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008024 }
8025 if (flags & F_ZERO)
8026 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 break;
8028
8029 case 'e':
8030 case 'E':
8031 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008032 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 case 'g':
8034 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00008035 if (c == 'F')
8036 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008037 pbuf = formatbuf;
8038 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8039 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 if (len < 0)
8041 goto onError;
8042 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00008043 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 fill = '0';
8045 break;
8046
8047 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008048 pbuf = formatbuf;
8049 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 if (len < 0)
8051 goto onError;
8052 break;
8053
8054 default:
8055 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00008056 "unsupported format character '%c' (0x%x) "
Armin Rigo7ccbca92006-10-04 12:17:45 +00008057 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00008058 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00008059 (int)c,
Armin Rigo7ccbca92006-10-04 12:17:45 +00008060 (Py_ssize_t)(fmt - 1 -
8061 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 goto onError;
8063 }
8064 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008065 if (*pbuf == '-' || *pbuf == '+') {
8066 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 len--;
8068 }
8069 else if (flags & F_SIGN)
8070 sign = '+';
8071 else if (flags & F_BLANK)
8072 sign = ' ';
8073 else
8074 sign = 0;
8075 }
8076 if (width < len)
8077 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008078 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 reslen -= rescnt;
8080 rescnt = width + fmtcnt + 100;
8081 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008082 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00008083 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00008084 PyErr_NoMemory();
8085 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00008086 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00008087 if (_PyUnicode_Resize(&result, reslen) < 0) {
8088 Py_XDECREF(temp);
8089 goto onError;
8090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 res = PyUnicode_AS_UNICODE(result)
8092 + reslen - rescnt;
8093 }
8094 if (sign) {
8095 if (fill != ' ')
8096 *res++ = sign;
8097 rescnt--;
8098 if (width > len)
8099 width--;
8100 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008101 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8102 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008103 assert(pbuf[1] == c);
8104 if (fill != ' ') {
8105 *res++ = *pbuf++;
8106 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00008107 }
Tim Petersfff53252001-04-12 18:38:48 +00008108 rescnt -= 2;
8109 width -= 2;
8110 if (width < 0)
8111 width = 0;
8112 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00008113 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 if (width > len && !(flags & F_LJUST)) {
8115 do {
8116 --rescnt;
8117 *res++ = fill;
8118 } while (--width > len);
8119 }
Tim Peters38fd5b62000-09-21 05:43:11 +00008120 if (fill == ' ') {
8121 if (sign)
8122 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00008123 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00008124 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00008125 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00008126 *res++ = *pbuf++;
8127 *res++ = *pbuf++;
8128 }
8129 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008130 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 res += len;
8132 rescnt -= len;
8133 while (--width >= len) {
8134 --rescnt;
8135 *res++ = ' ';
8136 }
8137 if (dict && (argidx < arglen) && c != '%') {
8138 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008139 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008140 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141 goto onError;
8142 }
8143 Py_XDECREF(temp);
8144 } /* '%' */
8145 } /* until end */
8146 if (argidx < arglen && !dict) {
8147 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00008148 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 goto onError;
8150 }
8151
Thomas Woutersa96affe2006-03-12 00:29:36 +00008152 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8153 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 if (args_owned) {
8155 Py_DECREF(args);
8156 }
8157 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 return (PyObject *)result;
8159
8160 onError:
8161 Py_XDECREF(result);
8162 Py_DECREF(uformat);
8163 if (args_owned) {
8164 Py_DECREF(args);
8165 }
8166 return NULL;
8167}
8168
8169static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008170 (readbufferproc) unicode_buffer_getreadbuf,
8171 (writebufferproc) unicode_buffer_getwritebuf,
8172 (segcountproc) unicode_buffer_getsegcount,
8173 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174};
8175
Jeremy Hylton938ace62002-07-17 16:30:39 +00008176static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008177unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8178
Tim Peters6d6c1a32001-08-02 04:15:00 +00008179static PyObject *
8180unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8181{
8182 PyObject *x = NULL;
Martin v. Löwis15e62742006-02-27 16:46:16 +00008183 static char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00008184 char *encoding = NULL;
8185 char *errors = NULL;
8186
Guido van Rossume023fe02001-08-30 03:12:59 +00008187 if (type != &PyUnicode_Type)
8188 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008189 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8190 kwlist, &x, &encoding, &errors))
8191 return NULL;
8192 if (x == NULL)
8193 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00008194 if (encoding == NULL && errors == NULL)
8195 return PyObject_Unicode(x);
8196 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00008197 return PyUnicode_FromEncodedObject(x, encoding, errors);
8198}
8199
Guido van Rossume023fe02001-08-30 03:12:59 +00008200static PyObject *
8201unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8202{
Tim Petersaf90b3e2001-09-12 05:18:58 +00008203 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008204 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008205
8206 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8207 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8208 if (tmp == NULL)
8209 return NULL;
8210 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00008211 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008212 if (pnew == NULL) {
8213 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00008214 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00008215 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008216 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8217 if (pnew->str == NULL) {
8218 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008219 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00008220 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00008221 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00008222 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00008223 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8224 pnew->length = n;
8225 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00008226 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00008227 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008228}
8229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008230PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00008231"unicode(string [, encoding[, errors]]) -> object\n\
8232\n\
8233Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008234encoding defaults to the current default string encoding.\n\
8235errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008236
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008238 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 "unicode", /* tp_name */
8240 sizeof(PyUnicodeObject), /* tp_size */
8241 0, /* tp_itemsize */
8242 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00008243 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008245 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 0, /* tp_setattr */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008247 0, /* tp_compare */
Georg Brandl347b3002006-03-30 11:57:00 +00008248 unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008249 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008251 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 (hashfunc) unicode_hash, /* tp_hash*/
8253 0, /* tp_call*/
8254 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008255 PyObject_GenericGetAttr, /* tp_getattro */
8256 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008258 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Neal Norwitzee3a1b52007-02-25 19:44:48 +00008259 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008260 unicode_doc, /* tp_doc */
8261 0, /* tp_traverse */
8262 0, /* tp_clear */
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00008263 PyUnicode_RichCompare, /* tp_richcompare */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008264 0, /* tp_weaklistoffset */
8265 0, /* tp_iter */
8266 0, /* tp_iternext */
8267 unicode_methods, /* tp_methods */
8268 0, /* tp_members */
8269 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00008270 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00008271 0, /* tp_dict */
8272 0, /* tp_descr_get */
8273 0, /* tp_descr_set */
8274 0, /* tp_dictoffset */
8275 0, /* tp_init */
8276 0, /* tp_alloc */
8277 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008278 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279};
8280
8281/* Initialize the Unicode implementation */
8282
Thomas Wouters78890102000-07-22 19:25:51 +00008283void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008285 int i;
8286
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008287 /* XXX - move this array to unicodectype.c ? */
8288 Py_UNICODE linebreak[] = {
8289 0x000A, /* LINE FEED */
8290 0x000D, /* CARRIAGE RETURN */
8291 0x001C, /* FILE SEPARATOR */
8292 0x001D, /* GROUP SEPARATOR */
8293 0x001E, /* RECORD SEPARATOR */
8294 0x0085, /* NEXT LINE */
8295 0x2028, /* LINE SEPARATOR */
8296 0x2029, /* PARAGRAPH SEPARATOR */
8297 };
8298
Fred Drakee4315f52000-05-09 19:53:39 +00008299 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008300 unicode_freelist = NULL;
8301 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008303 if (!unicode_empty)
8304 return;
8305
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008306 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008307 for (i = 0; i < 256; i++)
8308 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008309 if (PyType_Ready(&PyUnicode_Type) < 0)
8310 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008311
8312 /* initialize the linebreak bloom filter */
8313 bloom_linebreak = make_bloom_mask(
8314 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8315 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008316
8317 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318}
8319
8320/* Finalize the Unicode implementation */
8321
8322void
Thomas Wouters78890102000-07-22 19:25:51 +00008323_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008325 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008326 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008328 Py_XDECREF(unicode_empty);
8329 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008330
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008331 for (i = 0; i < 256; i++) {
8332 if (unicode_latin1[i]) {
8333 Py_DECREF(unicode_latin1[i]);
8334 unicode_latin1[i] = NULL;
8335 }
8336 }
8337
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008338 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339 PyUnicodeObject *v = u;
8340 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00008341 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00008342 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008343 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00008344 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008346 unicode_freelist = NULL;
8347 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008349
Anthony Baxterac6bd462006-04-13 02:06:09 +00008350#ifdef __cplusplus
8351}
8352#endif
8353
8354
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008355/*
8356Local variables:
8357c-basic-offset: 4
8358indent-tabs-mode: nil
8359End:
8360*/