blob: a409b2263b77439c5fb2aaab6f0a2b4c296fa2d9 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Upper bound on the number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "keep alive" optimization.

   An object that goes onto the free list keeps its character buffer as
   long as its length is below this limit; longer buffers are released
   right away.  This saves malloc() calls for small Unicode objects.

   In the worst case this pins PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.

   One entry per ASCII code point (0x00..0x7F); an entry is 1 when the
   code point is treated as whitespace:

     0x09 CHARACTER TABULATION    0x1C FILE SEPARATOR
     0x0A LINE FEED               0x1D GROUP SEPARATOR
     0x0B LINE TABULATION         0x1E RECORD SEPARATOR
     0x0C FORM FEED               0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN         0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Fast detection of line break characters in the ASCII range.

   One entry per ASCII code point (0x00..0x7F); an entry is 1 when the
   code point is treated as a line break:

     0x0A LINE FEED               0x1C FILE SEPARATOR
     0x0B LINE TABULATION         0x1D GROUP SEPARATOR
     0x0C FORM FEED               0x1E RECORD SEPARATOR
     0x0D CARRIAGE RETURN

   The table is only ever read (see BLOOM_LINEBREAK), hence const.
*/
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
189
190
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000192PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000194#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000197 /* This is actually an illegal character, so it should
198 not be passed to unichr. */
199 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000200#endif
201}
202
Thomas Wouters477c8d52006-05-27 19:21:47 +0000203/* --- Bloom Filters ----------------------------------------------------- */
204
205/* stuff to implement simple "bloom filters" for Unicode characters.
206 to keep things simple, we use a single bitmask, using the least 5
207 bits from each unicode characters as the bit index. */
208
209/* the linebreak mask is set up by Unicode_Init below */
210
Antoine Pitrouf068f942010-01-13 14:19:12 +0000211#if LONG_BIT >= 128
212#define BLOOM_WIDTH 128
213#elif LONG_BIT >= 64
214#define BLOOM_WIDTH 64
215#elif LONG_BIT >= 32
216#define BLOOM_WIDTH 32
217#else
218#error "LONG_BIT is smaller than 32"
219#endif
220
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221#define BLOOM_MASK unsigned long
222
223static BLOOM_MASK bloom_linebreak;
224
Antoine Pitrouf068f942010-01-13 14:19:12 +0000225#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
226#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Benjamin Peterson29060642009-01-31 22:14:21 +0000228#define BLOOM_LINEBREAK(ch) \
229 ((ch) < 128U ? ascii_linebreak[(ch)] : \
230 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231
232Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
233{
234 /* calculate simple bloom-style bitmask for a given unicode string */
235
Antoine Pitrouf068f942010-01-13 14:19:12 +0000236 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000237 Py_ssize_t i;
238
239 mask = 0;
240 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000241 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000242
243 return mask;
244}
245
246Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
247{
248 Py_ssize_t i;
249
250 for (i = 0; i < setlen; i++)
251 if (set[i] == chr)
252 return 1;
253
254 return 0;
255}
256
/* Membership test: the cheap bloom mask check runs first and the exact
   linear search only when the mask says "maybe".  The expansion is fully
   parenthesized so it can be negated or combined with other operators
   without changing its meaning.  chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
259
Guido van Rossumd57fd912000-03-10 22:53:23 +0000260/* --- Unicode Object ----------------------------------------------------- */
261
262static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265{
266 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000267
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000268 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000272 /* Resizing shared object (unicode_empty or single character
273 objects) in-place is not allowed. Use PyUnicode_Resize()
274 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000275
Benjamin Peterson14339b62009-01-31 16:36:08 +0000276 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000277 (unicode->length == 1 &&
278 unicode->str[0] < 256U &&
279 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000281 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282 return -1;
283 }
284
Thomas Wouters477c8d52006-05-27 19:21:47 +0000285 /* We allocate one more byte to make sure the string is Ux0000 terminated.
286 The overallocation is also used by fastsearch, which assumes that it's
287 safe to look at str[length] (without making any assumptions about what
288 it contains). */
289
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000291 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000294 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295 PyErr_NoMemory();
296 return -1;
297 }
298 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300
Benjamin Peterson29060642009-01-31 22:14:21 +0000301 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 }
307 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000308
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 return 0;
310}
311
312/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000313 Ux0000 terminated; some code (e.g. new_identifier)
314 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000317 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318
319*/
320
321static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000322PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323{
324 register PyUnicodeObject *unicode;
325
Thomas Wouters477c8d52006-05-27 19:21:47 +0000326 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000327 if (length == 0 && unicode_empty != NULL) {
328 Py_INCREF(unicode_empty);
329 return unicode_empty;
330 }
331
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000332 /* Ensure we won't overflow the size. */
333 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
334 return (PyUnicodeObject *)PyErr_NoMemory();
335 }
336
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000338 if (free_list) {
339 unicode = free_list;
340 free_list = *(PyUnicodeObject **)unicode;
341 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000342 if (unicode->str) {
343 /* Keep-Alive optimization: we only upsize the buffer,
344 never downsize it. */
345 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000346 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000347 PyObject_DEL(unicode->str);
348 unicode->str = NULL;
349 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000350 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000352 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
353 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000354 }
355 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 }
357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000359 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 if (unicode == NULL)
361 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000362 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
363 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364 }
365
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000367 PyErr_NoMemory();
368 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000369 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000371 * the caller fails before initializing str -- unicode_resize()
372 * reads str[0], and the Keep-Alive optimization can keep memory
373 * allocated for str alive across a call to unicode_dealloc(unicode).
374 * We don't want unicode_resize to read uninitialized memory in
375 * that case.
376 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000377 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000381 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000382 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384
Benjamin Peterson29060642009-01-31 22:14:21 +0000385 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000386 /* XXX UNREF/NEWREF interface should be more symmetrical */
387 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000388 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000389 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391}
392
393static
Guido van Rossum9475a232001-10-05 20:51:39 +0000394void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395{
Walter Dörwald16807132007-05-25 13:52:07 +0000396 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_NOT_INTERNED:
398 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000399
Benjamin Peterson29060642009-01-31 22:14:21 +0000400 case SSTATE_INTERNED_MORTAL:
401 /* revive dead object temporarily for DelItem */
402 Py_REFCNT(unicode) = 3;
403 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
404 Py_FatalError(
405 "deletion of interned string failed");
406 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 case SSTATE_INTERNED_IMMORTAL:
409 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 default:
412 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000413 }
414
Guido van Rossum604ddf82001-12-06 20:03:56 +0000415 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000416 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000417 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
419 PyObject_DEL(unicode->str);
420 unicode->str = NULL;
421 unicode->length = 0;
422 }
423 if (unicode->defenc) {
424 Py_DECREF(unicode->defenc);
425 unicode->defenc = NULL;
426 }
427 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000428 *(PyUnicodeObject **)unicode = free_list;
429 free_list = unicode;
430 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000433 PyObject_DEL(unicode->str);
434 Py_XDECREF(unicode->defenc);
435 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436 }
437}
438
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000439static
440int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000441{
442 register PyUnicodeObject *v;
443
444 /* Argument checks */
445 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000449 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000450 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 PyErr_BadInternalCall();
452 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453 }
454
455 /* Resizing unicode_empty and single character objects is not
456 possible since these are being shared. We simply return a fresh
457 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000458 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000459 (v == unicode_empty || v->length == 1)) {
460 PyUnicodeObject *w = _PyUnicode_New(length);
461 if (w == NULL)
462 return -1;
463 Py_UNICODE_COPY(w->str, v->str,
464 length < v->length ? length : v->length);
465 Py_DECREF(*unicode);
466 *unicode = w;
467 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000468 }
469
470 /* Note that we don't have to modify *unicode for unshared Unicode
471 objects, since we can modify them in-place. */
472 return unicode_resize(v, length);
473}
474
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000475int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
476{
477 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
478}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479
Guido van Rossumd57fd912000-03-10 22:53:23 +0000480PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482{
483 PyUnicodeObject *unicode;
484
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000485 /* If the Unicode data is known at construction time, we can apply
486 some optimizations which share commonly used objects. */
487 if (u != NULL) {
488
Benjamin Peterson29060642009-01-31 22:14:21 +0000489 /* Optimization for empty strings */
490 if (size == 0 && unicode_empty != NULL) {
491 Py_INCREF(unicode_empty);
492 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000493 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000494
495 /* Single character Unicode objects in the Latin-1 range are
496 shared when using this constructor */
497 if (size == 1 && *u < 256) {
498 unicode = unicode_latin1[*u];
499 if (!unicode) {
500 unicode = _PyUnicode_New(1);
501 if (!unicode)
502 return NULL;
503 unicode->str[0] = *u;
504 unicode_latin1[*u] = unicode;
505 }
506 Py_INCREF(unicode);
507 return (PyObject *)unicode;
508 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000509 }
Tim Petersced69f82003-09-16 20:30:58 +0000510
Guido van Rossumd57fd912000-03-10 22:53:23 +0000511 unicode = _PyUnicode_New(size);
512 if (!unicode)
513 return NULL;
514
515 /* Copy the Unicode data into the new object */
516 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000517 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518
519 return (PyObject *)unicode;
520}
521
Walter Dörwaldd2034312007-05-18 16:29:38 +0000522PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523{
524 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000525
Benjamin Peterson14339b62009-01-31 16:36:08 +0000526 if (size < 0) {
527 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000528 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 return NULL;
530 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000531
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000532 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000533 some optimizations which share commonly used objects.
534 Also, this means the input must be UTF-8, so fall back to the
535 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 if (u != NULL) {
537
Benjamin Peterson29060642009-01-31 22:14:21 +0000538 /* Optimization for empty strings */
539 if (size == 0 && unicode_empty != NULL) {
540 Py_INCREF(unicode_empty);
541 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000542 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000543
544 /* Single characters are shared when using this constructor.
545 Restrict to ASCII, since the input must be UTF-8. */
546 if (size == 1 && Py_CHARMASK(*u) < 128) {
547 unicode = unicode_latin1[Py_CHARMASK(*u)];
548 if (!unicode) {
549 unicode = _PyUnicode_New(1);
550 if (!unicode)
551 return NULL;
552 unicode->str[0] = Py_CHARMASK(*u);
553 unicode_latin1[Py_CHARMASK(*u)] = unicode;
554 }
555 Py_INCREF(unicode);
556 return (PyObject *)unicode;
557 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000558
559 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000560 }
561
Walter Dörwald55507312007-05-18 13:12:10 +0000562 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000563 if (!unicode)
564 return NULL;
565
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000566 return (PyObject *)unicode;
567}
568
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569PyObject *PyUnicode_FromString(const char *u)
570{
571 size_t size = strlen(u);
572 if (size > PY_SSIZE_T_MAX) {
573 PyErr_SetString(PyExc_OverflowError, "input too long");
574 return NULL;
575 }
576
577 return PyUnicode_FromStringAndSize(u, size);
578}
579
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580#ifdef HAVE_WCHAR_H
581
Mark Dickinson081dfee2009-03-18 14:47:41 +0000582#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
583# define CONVERT_WCHAR_TO_SURROGATES
584#endif
585
586#ifdef CONVERT_WCHAR_TO_SURROGATES
587
588/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
589 to convert from UTF32 to UTF16. */
590
591PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
592 Py_ssize_t size)
593{
594 PyUnicodeObject *unicode;
595 register Py_ssize_t i;
596 Py_ssize_t alloc;
597 const wchar_t *orig_w;
598
599 if (w == NULL) {
600 if (size == 0)
601 return PyUnicode_FromStringAndSize(NULL, 0);
602 PyErr_BadInternalCall();
603 return NULL;
604 }
605
606 if (size == -1) {
607 size = wcslen(w);
608 }
609
610 alloc = size;
611 orig_w = w;
612 for (i = size; i > 0; i--) {
613 if (*w > 0xFFFF)
614 alloc++;
615 w++;
616 }
617 w = orig_w;
618 unicode = _PyUnicode_New(alloc);
619 if (!unicode)
620 return NULL;
621
622 /* Copy the wchar_t data into the new object */
623 {
624 register Py_UNICODE *u;
625 u = PyUnicode_AS_UNICODE(unicode);
626 for (i = size; i > 0; i--) {
627 if (*w > 0xFFFF) {
628 wchar_t ordinal = *w++;
629 ordinal -= 0x10000;
630 *u++ = 0xD800 | (ordinal >> 10);
631 *u++ = 0xDC00 | (ordinal & 0x3FF);
632 }
633 else
634 *u++ = *w++;
635 }
636 }
637 return (PyObject *)unicode;
638}
639
640#else
641
Guido van Rossumd57fd912000-03-10 22:53:23 +0000642PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000643 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644{
645 PyUnicodeObject *unicode;
646
647 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000648 if (size == 0)
649 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000650 PyErr_BadInternalCall();
651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652 }
653
Martin v. Löwis790465f2008-04-05 20:41:37 +0000654 if (size == -1) {
655 size = wcslen(w);
656 }
657
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 unicode = _PyUnicode_New(size);
659 if (!unicode)
660 return NULL;
661
662 /* Copy the wchar_t data into the new object */
663#ifdef HAVE_USABLE_WCHAR_T
664 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000665#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000667 register Py_UNICODE *u;
668 register Py_ssize_t i;
669 u = PyUnicode_AS_UNICODE(unicode);
670 for (i = size; i > 0; i--)
671 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000672 }
673#endif
674
675 return (PyObject *)unicode;
676}
677
Mark Dickinson081dfee2009-03-18 14:47:41 +0000678#endif /* CONVERT_WCHAR_TO_SURROGATES */
679
680#undef CONVERT_WCHAR_TO_SURROGATES
681
Walter Dörwald346737f2007-05-31 10:44:43 +0000682static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000683makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
684 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000685{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000686 *fmt++ = '%';
687 if (width) {
688 if (zeropad)
689 *fmt++ = '0';
690 fmt += sprintf(fmt, "%d", width);
691 }
692 if (precision)
693 fmt += sprintf(fmt, ".%d", precision);
694 if (longflag)
695 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000696 else if (longlongflag) {
697 /* longlongflag should only ever be nonzero on machines with
698 HAVE_LONG_LONG defined */
699#ifdef HAVE_LONG_LONG
700 char *f = PY_FORMAT_LONG_LONG;
701 while (*f)
702 *fmt++ = *f++;
703#else
704 /* we shouldn't ever get here */
705 assert(0);
706 *fmt++ = 'l';
707#endif
708 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000709 else if (size_tflag) {
710 char *f = PY_FORMAT_SIZE_T;
711 while (*f)
712 *fmt++ = *f++;
713 }
714 *fmt++ = c;
715 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000716}
717
/* Append the NUL-terminated char string `string` to the output, one
   character per output element.  The expanding scope must provide an
   output cursor `s` and a scratch pointer `copy` (const char *); both are
   modified.  Wrapped in do/while(0) so that the macro behaves as a single
   statement (safe in an unbraced if/else), and the argument is
   parenthesized so that any expression can be passed. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy; ) \
            *s++ = *copy++; \
    } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
729
Walter Dörwaldd2034312007-05-18 16:29:38 +0000730PyObject *
731PyUnicode_FromFormatV(const char *format, va_list vargs)
732{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000733 va_list count;
734 Py_ssize_t callcount = 0;
735 PyObject **callresults = NULL;
736 PyObject **callresult = NULL;
737 Py_ssize_t n = 0;
738 int width = 0;
739 int precision = 0;
740 int zeropad;
741 const char* f;
742 Py_UNICODE *s;
743 PyObject *string;
744 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000745 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 /* use abuffer instead of buffer, if we need more space
747 * (which can happen if there's a format specifier with width). */
748 char *abuffer = NULL;
749 char *realbuffer;
750 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000751 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000753
754#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000755 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000756#else
757#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000759#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000760 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000761#endif
762#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000763 /* step 1: count the number of %S/%R/%A/%s format specifications
764 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
765 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
766 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000767 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000768 if (*f == '%') {
769 if (*(f+1)=='%')
770 continue;
771 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
772 ++callcount;
773 while (ISDIGIT((unsigned)*f))
774 width = (width*10) + *f++ - '0';
775 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
776 ;
777 if (*f == 's')
778 ++callcount;
779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000780 }
781 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000782 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000783 if (callcount) {
784 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
785 if (!callresults) {
786 PyErr_NoMemory();
787 return NULL;
788 }
789 callresult = callresults;
790 }
791 /* step 3: figure out how large a buffer we need */
792 for (f = format; *f; f++) {
793 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000794#ifdef HAVE_LONG_LONG
795 int longlongflag = 0;
796#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 const char* p = f;
798 width = 0;
799 while (ISDIGIT((unsigned)*f))
800 width = (width*10) + *f++ - '0';
801 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
802 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000803
Benjamin Peterson14339b62009-01-31 16:36:08 +0000804 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
805 * they don't affect the amount of space we reserve.
806 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000807 if (*f == 'l') {
808 if (f[1] == 'd' || f[1] == 'u') {
809 ++f;
810 }
811#ifdef HAVE_LONG_LONG
812 else if (f[1] == 'l' &&
813 (f[2] == 'd' || f[2] == 'u')) {
814 longlongflag = 1;
815 f += 2;
816 }
817#endif
818 }
819 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000820 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000821 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000822
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 switch (*f) {
824 case 'c':
825 (void)va_arg(count, int);
826 /* fall through... */
827 case '%':
828 n++;
829 break;
830 case 'd': case 'u': case 'i': case 'x':
831 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000832#ifdef HAVE_LONG_LONG
833 if (longlongflag) {
834 if (width < MAX_LONG_LONG_CHARS)
835 width = MAX_LONG_LONG_CHARS;
836 }
837 else
838#endif
839 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
840 including sign. Decimal takes the most space. This
841 isn't enough for octal. If a width is specified we
842 need more (which we allocate later). */
843 if (width < MAX_LONG_CHARS)
844 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000845 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000846 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 if (abuffersize < width)
848 abuffersize = width;
849 break;
850 case 's':
851 {
852 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000853 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000854 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
855 if (!str)
856 goto fail;
857 n += PyUnicode_GET_SIZE(str);
858 /* Remember the str and switch to the next slot */
859 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000860 break;
861 }
862 case 'U':
863 {
864 PyObject *obj = va_arg(count, PyObject *);
865 assert(obj && PyUnicode_Check(obj));
866 n += PyUnicode_GET_SIZE(obj);
867 break;
868 }
869 case 'V':
870 {
871 PyObject *obj = va_arg(count, PyObject *);
872 const char *str = va_arg(count, const char *);
873 assert(obj || str);
874 assert(!obj || PyUnicode_Check(obj));
875 if (obj)
876 n += PyUnicode_GET_SIZE(obj);
877 else
878 n += strlen(str);
879 break;
880 }
881 case 'S':
882 {
883 PyObject *obj = va_arg(count, PyObject *);
884 PyObject *str;
885 assert(obj);
886 str = PyObject_Str(obj);
887 if (!str)
888 goto fail;
889 n += PyUnicode_GET_SIZE(str);
890 /* Remember the str and switch to the next slot */
891 *callresult++ = str;
892 break;
893 }
894 case 'R':
895 {
896 PyObject *obj = va_arg(count, PyObject *);
897 PyObject *repr;
898 assert(obj);
899 repr = PyObject_Repr(obj);
900 if (!repr)
901 goto fail;
902 n += PyUnicode_GET_SIZE(repr);
903 /* Remember the repr and switch to the next slot */
904 *callresult++ = repr;
905 break;
906 }
907 case 'A':
908 {
909 PyObject *obj = va_arg(count, PyObject *);
910 PyObject *ascii;
911 assert(obj);
912 ascii = PyObject_ASCII(obj);
913 if (!ascii)
914 goto fail;
915 n += PyUnicode_GET_SIZE(ascii);
916 /* Remember the repr and switch to the next slot */
917 *callresult++ = ascii;
918 break;
919 }
920 case 'p':
921 (void) va_arg(count, int);
922 /* maximum 64-bit pointer representation:
923 * 0xffffffffffffffff
924 * so 19 characters is enough.
925 * XXX I count 18 -- what's the extra for?
926 */
927 n += 19;
928 break;
929 default:
930 /* if we stumble upon an unknown
931 formatting code, copy the rest of
932 the format string to the output
933 string. (we cannot just skip the
934 code, since there's no way to know
935 what's in the argument list) */
936 n += strlen(p);
937 goto expand;
938 }
939 } else
940 n++;
941 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000942 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000943 if (abuffersize > ITEM_BUFFER_LEN) {
944 /* add 1 for sprintf's trailing null byte */
945 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000946 if (!abuffer) {
947 PyErr_NoMemory();
948 goto fail;
949 }
950 realbuffer = abuffer;
951 }
952 else
953 realbuffer = buffer;
954 /* step 4: fill the buffer */
955 /* Since we've analyzed how much space we need for the worst case,
956 we don't have to resize the string.
957 There can be no errors beyond this point. */
958 string = PyUnicode_FromUnicode(NULL, n);
959 if (!string)
960 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000961
Benjamin Peterson14339b62009-01-31 16:36:08 +0000962 s = PyUnicode_AS_UNICODE(string);
963 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000964
Benjamin Peterson14339b62009-01-31 16:36:08 +0000965 for (f = format; *f; f++) {
966 if (*f == '%') {
967 const char* p = f++;
968 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000969 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000970 int size_tflag = 0;
971 zeropad = (*f == '0');
972 /* parse the width.precision part */
973 width = 0;
974 while (ISDIGIT((unsigned)*f))
975 width = (width*10) + *f++ - '0';
976 precision = 0;
977 if (*f == '.') {
978 f++;
979 while (ISDIGIT((unsigned)*f))
980 precision = (precision*10) + *f++ - '0';
981 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000982 /* Handle %ld, %lu, %lld and %llu. */
983 if (*f == 'l') {
984 if (f[1] == 'd' || f[1] == 'u') {
985 longflag = 1;
986 ++f;
987 }
988#ifdef HAVE_LONG_LONG
989 else if (f[1] == 'l' &&
990 (f[2] == 'd' || f[2] == 'u')) {
991 longlongflag = 1;
992 f += 2;
993 }
994#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000995 }
996 /* handle the size_t flag. */
997 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
998 size_tflag = 1;
999 ++f;
1000 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001001
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 switch (*f) {
1003 case 'c':
1004 *s++ = va_arg(vargs, int);
1005 break;
1006 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001007 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1008 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001009 if (longflag)
1010 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001011#ifdef HAVE_LONG_LONG
1012 else if (longlongflag)
1013 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1014#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 else if (size_tflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1017 else
1018 sprintf(realbuffer, fmt, va_arg(vargs, int));
1019 appendstring(realbuffer);
1020 break;
1021 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001022 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1023 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001024 if (longflag)
1025 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001026#ifdef HAVE_LONG_LONG
1027 else if (longlongflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs,
1029 unsigned PY_LONG_LONG));
1030#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001031 else if (size_tflag)
1032 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1033 else
1034 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1035 appendstring(realbuffer);
1036 break;
1037 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001038 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001039 sprintf(realbuffer, fmt, va_arg(vargs, int));
1040 appendstring(realbuffer);
1041 break;
1042 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001043 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 sprintf(realbuffer, fmt, va_arg(vargs, int));
1045 appendstring(realbuffer);
1046 break;
1047 case 's':
1048 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001049 /* unused, since we already have the result */
1050 (void) va_arg(vargs, char *);
1051 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1052 PyUnicode_GET_SIZE(*callresult));
1053 s += PyUnicode_GET_SIZE(*callresult);
1054 /* We're done with the unicode()/repr() => forget it */
1055 Py_DECREF(*callresult);
1056 /* switch to next unicode()/repr() result */
1057 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001058 break;
1059 }
1060 case 'U':
1061 {
1062 PyObject *obj = va_arg(vargs, PyObject *);
1063 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1064 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1065 s += size;
1066 break;
1067 }
1068 case 'V':
1069 {
1070 PyObject *obj = va_arg(vargs, PyObject *);
1071 const char *str = va_arg(vargs, const char *);
1072 if (obj) {
1073 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1074 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1075 s += size;
1076 } else {
1077 appendstring(str);
1078 }
1079 break;
1080 }
1081 case 'S':
1082 case 'R':
1083 {
1084 Py_UNICODE *ucopy;
1085 Py_ssize_t usize;
1086 Py_ssize_t upos;
1087 /* unused, since we already have the result */
1088 (void) va_arg(vargs, PyObject *);
1089 ucopy = PyUnicode_AS_UNICODE(*callresult);
1090 usize = PyUnicode_GET_SIZE(*callresult);
1091 for (upos = 0; upos<usize;)
1092 *s++ = ucopy[upos++];
1093 /* We're done with the unicode()/repr() => forget it */
1094 Py_DECREF(*callresult);
1095 /* switch to next unicode()/repr() result */
1096 ++callresult;
1097 break;
1098 }
1099 case 'p':
1100 sprintf(buffer, "%p", va_arg(vargs, void*));
1101 /* %p is ill-defined: ensure leading 0x. */
1102 if (buffer[1] == 'X')
1103 buffer[1] = 'x';
1104 else if (buffer[1] != 'x') {
1105 memmove(buffer+2, buffer, strlen(buffer)+1);
1106 buffer[0] = '0';
1107 buffer[1] = 'x';
1108 }
1109 appendstring(buffer);
1110 break;
1111 case '%':
1112 *s++ = '%';
1113 break;
1114 default:
1115 appendstring(p);
1116 goto end;
1117 }
1118 } else
1119 *s++ = *f;
1120 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001121
Benjamin Peterson29060642009-01-31 22:14:21 +00001122 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001123 if (callresults)
1124 PyObject_Free(callresults);
1125 if (abuffer)
1126 PyObject_Free(abuffer);
1127 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1128 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001129 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001130 if (callresults) {
1131 PyObject **callresult2 = callresults;
1132 while (callresult2 < callresult) {
1133 Py_DECREF(*callresult2);
1134 ++callresult2;
1135 }
1136 PyObject_Free(callresults);
1137 }
1138 if (abuffer)
1139 PyObject_Free(abuffer);
1140 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001141}
1142
1143#undef appendstring
1144
1145PyObject *
1146PyUnicode_FromFormat(const char *format, ...)
1147{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001148 PyObject* ret;
1149 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001150
1151#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001153#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001156 ret = PyUnicode_FromFormatV(format, vargs);
1157 va_end(vargs);
1158 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001159}
1160
Martin v. Löwis18e16552006-02-15 17:27:45 +00001161Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 wchar_t *w,
1163 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164{
1165 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 PyErr_BadInternalCall();
1167 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001169
1170 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001173
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174#ifdef HAVE_USABLE_WCHAR_T
1175 memcpy(w, unicode->str, size * sizeof(wchar_t));
1176#else
1177 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 register Py_UNICODE *u;
1179 register Py_ssize_t i;
1180 u = PyUnicode_AS_UNICODE(unicode);
1181 for (i = size; i > 0; i--)
1182 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 }
1184#endif
1185
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001186 if (size > PyUnicode_GET_SIZE(unicode))
1187 return PyUnicode_GET_SIZE(unicode);
1188 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001189 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190}
1191
1192#endif
1193
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001194PyObject *PyUnicode_FromOrdinal(int ordinal)
1195{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001196 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001197
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001198 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001199 PyErr_SetString(PyExc_ValueError,
1200 "chr() arg not in range(0x110000)");
1201 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001202 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001203
1204#ifndef Py_UNICODE_WIDE
1205 if (ordinal > 0xffff) {
1206 ordinal -= 0x10000;
1207 s[0] = 0xD800 | (ordinal >> 10);
1208 s[1] = 0xDC00 | (ordinal & 0x3FF);
1209 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001210 }
1211#endif
1212
Hye-Shik Chang40574832004-04-06 07:24:51 +00001213 s[0] = (Py_UNICODE)ordinal;
1214 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001215}
1216
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217PyObject *PyUnicode_FromObject(register PyObject *obj)
1218{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001219 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001220 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001221 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001222 Py_INCREF(obj);
1223 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001224 }
1225 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 /* For a Unicode subtype that's not a Unicode object,
1227 return a true Unicode object with the same data. */
1228 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1229 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001230 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001231 PyErr_Format(PyExc_TypeError,
1232 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001233 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001234 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001235}
1236
1237PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001238 const char *encoding,
1239 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001240{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001241 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001242 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001243 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001244
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001246 PyErr_BadInternalCall();
1247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001249
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001250 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001251 PyErr_SetString(PyExc_TypeError,
1252 "decoding str is not supported");
1253 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001254 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001255
1256 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001257 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001258 s = PyBytes_AS_STRING(obj);
1259 len = PyBytes_GET_SIZE(obj);
1260 }
1261 else if (PyByteArray_Check(obj)) {
1262 s = PyByteArray_AS_STRING(obj);
1263 len = PyByteArray_GET_SIZE(obj);
1264 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001265 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001266 /* Overwrite the error message with something more useful in
1267 case of a TypeError. */
1268 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001269 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001270 "coercing to str: need string or buffer, "
1271 "%.80s found",
1272 Py_TYPE(obj)->tp_name);
1273 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001274 }
Tim Petersced69f82003-09-16 20:30:58 +00001275
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001276 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001278 Py_INCREF(unicode_empty);
1279 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 }
Tim Petersced69f82003-09-16 20:30:58 +00001281 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001282 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001283
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001284 return v;
1285
Benjamin Peterson29060642009-01-31 22:14:21 +00001286 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001287 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288}
1289
1290PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001291 Py_ssize_t size,
1292 const char *encoding,
1293 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294{
1295 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001296 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001297 char lower[20]; /* Enough for any encoding name we recognize */
1298 char *l;
1299 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001300
1301 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001302 encoding = PyUnicode_GetDefaultEncoding();
1303
1304 /* Convert encoding to lower case and replace '_' with '-' in order to
1305 catch e.g. UTF_8 */
1306 e = encoding;
1307 l = lower;
1308 while (*e && l < &lower[(sizeof lower) - 2]) {
1309 if (ISUPPER(*e)) {
1310 *l++ = TOLOWER(*e++);
1311 }
1312 else if (*e == '_') {
1313 *l++ = '-';
1314 e++;
1315 }
1316 else {
1317 *l++ = *e++;
1318 }
1319 }
1320 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001321
1322 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001323 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001325 else if ((strcmp(lower, "latin-1") == 0) ||
1326 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001327 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001328#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001329 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001330 return PyUnicode_DecodeMBCS(s, size, errors);
1331#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001332 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001333 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001334 else if (strcmp(lower, "utf-16") == 0)
1335 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1336 else if (strcmp(lower, "utf-32") == 0)
1337 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338
1339 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001340 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001341 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001342 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001343 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344 if (buffer == NULL)
1345 goto onError;
1346 unicode = PyCodec_Decode(buffer, encoding, errors);
1347 if (unicode == NULL)
1348 goto onError;
1349 if (!PyUnicode_Check(unicode)) {
1350 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001351 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001352 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353 Py_DECREF(unicode);
1354 goto onError;
1355 }
1356 Py_DECREF(buffer);
1357 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001358
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360 Py_XDECREF(buffer);
1361 return NULL;
1362}
1363
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001364PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1365 const char *encoding,
1366 const char *errors)
1367{
1368 PyObject *v;
1369
1370 if (!PyUnicode_Check(unicode)) {
1371 PyErr_BadArgument();
1372 goto onError;
1373 }
1374
1375 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001377
1378 /* Decode via the codec registry */
1379 v = PyCodec_Decode(unicode, encoding, errors);
1380 if (v == NULL)
1381 goto onError;
1382 return v;
1383
Benjamin Peterson29060642009-01-31 22:14:21 +00001384 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001385 return NULL;
1386}
1387
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001388PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1389 const char *encoding,
1390 const char *errors)
1391{
1392 PyObject *v;
1393
1394 if (!PyUnicode_Check(unicode)) {
1395 PyErr_BadArgument();
1396 goto onError;
1397 }
1398
1399 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001400 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001401
1402 /* Decode via the codec registry */
1403 v = PyCodec_Decode(unicode, encoding, errors);
1404 if (v == NULL)
1405 goto onError;
1406 if (!PyUnicode_Check(v)) {
1407 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001408 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001409 Py_TYPE(v)->tp_name);
1410 Py_DECREF(v);
1411 goto onError;
1412 }
1413 return v;
1414
Benjamin Peterson29060642009-01-31 22:14:21 +00001415 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001416 return NULL;
1417}
1418
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001420 Py_ssize_t size,
1421 const char *encoding,
1422 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423{
1424 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001425
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 unicode = PyUnicode_FromUnicode(s, size);
1427 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001428 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1430 Py_DECREF(unicode);
1431 return v;
1432}
1433
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001434PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1435 const char *encoding,
1436 const char *errors)
1437{
1438 PyObject *v;
1439
1440 if (!PyUnicode_Check(unicode)) {
1441 PyErr_BadArgument();
1442 goto onError;
1443 }
1444
1445 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001446 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001447
1448 /* Encode via the codec registry */
1449 v = PyCodec_Encode(unicode, encoding, errors);
1450 if (v == NULL)
1451 goto onError;
1452 return v;
1453
Benjamin Peterson29060642009-01-31 22:14:21 +00001454 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001455 return NULL;
1456}
1457
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1459 const char *encoding,
1460 const char *errors)
1461{
1462 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001463
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464 if (!PyUnicode_Check(unicode)) {
1465 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467 }
Fred Drakee4315f52000-05-09 19:53:39 +00001468
Tim Petersced69f82003-09-16 20:30:58 +00001469 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001470 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001471
1472 /* Shortcuts for common default encodings */
1473 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001474 if (strcmp(encoding, "utf-8") == 0)
1475 return PyUnicode_AsUTF8String(unicode);
1476 else if (strcmp(encoding, "latin-1") == 0)
1477 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001478#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 else if (strcmp(encoding, "mbcs") == 0)
1480 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001481#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 else if (strcmp(encoding, "ascii") == 0)
1483 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001484 /* During bootstrap, we may need to find the encodings
1485 package, to load the file system encoding, and require the
1486 file system encoding in order to load the encodings
1487 package.
1488
1489 Break out of this dependency by assuming that the path to
1490 the encodings module is ASCII-only. XXX could try wcstombs
1491 instead, if the file system encoding is the locale's
1492 encoding. */
1493 else if (Py_FileSystemDefaultEncoding &&
1494 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1495 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498
1499 /* Encode via the codec registry */
1500 v = PyCodec_Encode(unicode, encoding, errors);
1501 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001502 return NULL;
1503
1504 /* The normal path */
1505 if (PyBytes_Check(v))
1506 return v;
1507
1508 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001509 if (PyByteArray_Check(v)) {
1510 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001511 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001512 PyOS_snprintf(msg, sizeof(msg),
1513 "encoder %s returned buffer instead of bytes",
1514 encoding);
1515 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001516 Py_DECREF(v);
1517 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001518 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001519
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001520 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1521 Py_DECREF(v);
1522 return b;
1523 }
1524
1525 PyErr_Format(PyExc_TypeError,
1526 "encoder did not return a bytes object (type=%.400s)",
1527 Py_TYPE(v)->tp_name);
1528 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001529 return NULL;
1530}
1531
1532PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1533 const char *encoding,
1534 const char *errors)
1535{
1536 PyObject *v;
1537
1538 if (!PyUnicode_Check(unicode)) {
1539 PyErr_BadArgument();
1540 goto onError;
1541 }
1542
1543 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001544 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001545
1546 /* Encode via the codec registry */
1547 v = PyCodec_Encode(unicode, encoding, errors);
1548 if (v == NULL)
1549 goto onError;
1550 if (!PyUnicode_Check(v)) {
1551 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001552 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001553 Py_TYPE(v)->tp_name);
1554 Py_DECREF(v);
1555 goto onError;
1556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001558
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560 return NULL;
1561}
1562
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001563PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001564 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001565{
1566 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001567 if (v)
1568 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001569 if (errors != NULL)
1570 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001571 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001572 PyUnicode_GET_SIZE(unicode),
1573 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001574 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001575 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001576 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001577 return v;
1578}
1579
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001580PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001581PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001582 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001583 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1584}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001585
Christian Heimes5894ba72007-11-04 11:43:14 +00001586PyObject*
1587PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1588{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001589 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1590 can be undefined. If it is case, decode using UTF-8. The following assumes
1591 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1592 bootstrapping process where the codecs aren't ready yet.
1593 */
1594 if (Py_FileSystemDefaultEncoding) {
1595#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001596 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001597 return PyUnicode_DecodeMBCS(s, size, "replace");
1598 }
1599#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001600 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001601 return PyUnicode_DecodeUTF8(s, size, "replace");
1602 }
1603#endif
1604 return PyUnicode_Decode(s, size,
1605 Py_FileSystemDefaultEncoding,
1606 "replace");
1607 }
1608 else {
1609 return PyUnicode_DecodeUTF8(s, size, "replace");
1610 }
1611}
1612
Martin v. Löwis011e8422009-05-05 04:43:17 +00001613/* Convert the argument to a bytes object, according to the file
Gregory P. Smithcc47d8c2010-02-27 08:33:11 +00001614 system encoding. The addr param must be a PyObject**.
1615 This is designed to be used with "O&" in PyArg_Parse APIs. */
Martin v. Löwis011e8422009-05-05 04:43:17 +00001616
1617int
1618PyUnicode_FSConverter(PyObject* arg, void* addr)
1619{
1620 PyObject *output = NULL;
1621 Py_ssize_t size;
1622 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001623 if (arg == NULL) {
1624 Py_DECREF(*(PyObject**)addr);
1625 return 1;
1626 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001627 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1628 output = arg;
1629 Py_INCREF(output);
1630 }
1631 else {
1632 arg = PyUnicode_FromObject(arg);
1633 if (!arg)
1634 return 0;
1635 output = PyUnicode_AsEncodedObject(arg,
1636 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001637 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001638 Py_DECREF(arg);
1639 if (!output)
1640 return 0;
1641 if (!PyBytes_Check(output)) {
1642 Py_DECREF(output);
1643 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1644 return 0;
1645 }
1646 }
1647 if (PyBytes_Check(output)) {
1648 size = PyBytes_GET_SIZE(output);
1649 data = PyBytes_AS_STRING(output);
1650 }
1651 else {
1652 size = PyByteArray_GET_SIZE(output);
1653 data = PyByteArray_AS_STRING(output);
1654 }
1655 if (size != strlen(data)) {
1656 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1657 Py_DECREF(output);
1658 return 0;
1659 }
1660 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001661 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001662}
1663
1664
Martin v. Löwis5b222132007-06-10 09:51:05 +00001665char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001666_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001667{
Christian Heimesf3863112007-11-22 07:46:41 +00001668 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001669 if (!PyUnicode_Check(unicode)) {
1670 PyErr_BadArgument();
1671 return NULL;
1672 }
Christian Heimesf3863112007-11-22 07:46:41 +00001673 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1674 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001675 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001676 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001677 *psize = PyBytes_GET_SIZE(bytes);
1678 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001679}
1680
1681char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001682_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001683{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001684 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001685}
1686
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1688{
1689 if (!PyUnicode_Check(unicode)) {
1690 PyErr_BadArgument();
1691 goto onError;
1692 }
1693 return PyUnicode_AS_UNICODE(unicode);
1694
Benjamin Peterson29060642009-01-31 22:14:21 +00001695 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 return NULL;
1697}
1698
Martin v. Löwis18e16552006-02-15 17:27:45 +00001699Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700{
1701 if (!PyUnicode_Check(unicode)) {
1702 PyErr_BadArgument();
1703 goto onError;
1704 }
1705 return PyUnicode_GET_SIZE(unicode);
1706
Benjamin Peterson29060642009-01-31 22:14:21 +00001707 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 return -1;
1709}
1710
Thomas Wouters78890102000-07-22 19:25:51 +00001711const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001712{
1713 return unicode_default_encoding;
1714}
1715
1716int PyUnicode_SetDefaultEncoding(const char *encoding)
1717{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001718 if (strcmp(encoding, unicode_default_encoding) != 0) {
1719 PyErr_Format(PyExc_ValueError,
1720 "Can only set default encoding to %s",
1721 unicode_default_encoding);
1722 return -1;
1723 }
Fred Drakee4315f52000-05-09 19:53:39 +00001724 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001725}
1726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001727/* error handling callback helper:
1728 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001729 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001730 and adjust various state variables.
1731 return 0 on success, -1 on error
1732*/
1733
1734static
1735int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001736 const char *encoding, const char *reason,
1737 const char **input, const char **inend, Py_ssize_t *startinpos,
1738 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1739 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001740{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001741 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742
1743 PyObject *restuple = NULL;
1744 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001745 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001746 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001747 Py_ssize_t requiredsize;
1748 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001749 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001750 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001751 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 int res = -1;
1753
1754 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001755 *errorHandler = PyCodec_LookupError(errors);
1756 if (*errorHandler == NULL)
1757 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001758 }
1759
1760 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001761 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001762 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1763 if (*exceptionObject == NULL)
1764 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001765 }
1766 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1768 goto onError;
1769 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1770 goto onError;
1771 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1772 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 }
1774
1775 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1776 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001777 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001778 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001779 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001780 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001781 }
1782 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001784
1785 /* Copy back the bytes variables, which might have been modified by the
1786 callback */
1787 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1788 if (!inputobj)
1789 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001790 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001792 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001793 *input = PyBytes_AS_STRING(inputobj);
1794 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001795 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001796 /* we can DECREF safely, as the exception has another reference,
1797 so the object won't go away. */
1798 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001799
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001800 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001802 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001803 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1804 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001805 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806
1807 /* need more space? (at least enough for what we
1808 have+the replacement+the rest of the string (starting
1809 at the new input position), so we won't have to check space
1810 when there are no errors in the rest of the string) */
1811 repptr = PyUnicode_AS_UNICODE(repunicode);
1812 repsize = PyUnicode_GET_SIZE(repunicode);
1813 requiredsize = *outpos + repsize + insize-newpos;
1814 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001815 if (requiredsize<2*outsize)
1816 requiredsize = 2*outsize;
1817 if (_PyUnicode_Resize(output, requiredsize) < 0)
1818 goto onError;
1819 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820 }
1821 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001822 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 Py_UNICODE_COPY(*outptr, repptr, repsize);
1824 *outptr += repsize;
1825 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001826
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 /* we made it! */
1828 res = 0;
1829
Benjamin Peterson29060642009-01-31 22:14:21 +00001830 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001831 Py_XDECREF(restuple);
1832 return res;
1833}
1834
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001835/* --- UTF-7 Codec -------------------------------------------------------- */
1836
Antoine Pitrou244651a2009-05-04 18:56:13 +00001837/* See RFC2152 for details. We encode conservatively and decode liberally. */
1838
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Every macro may evaluate its argument more than once, so
   arguments must be free of side effects. */

/* Non-zero if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z'))

/* Map a base-64 character to its 6-bit value.  The argument is expected to
   satisfy IS_BASE64(); anything else falls through to 63. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero if this byte, found outside a shift sequence in
 * a UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
1869
1870/* The UTF-7 encoder treats ASCII characters differently according to
1871 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1872 * the above). See RFC2152. This array identifies these different
1873 * sets:
1874 * 0 : "Set D"
1875 * alphanumeric and '(),-./:?
1876 * 1 : "Set O"
1877 * !"#$%&*;<=>@[]^_`{|}
1878 * 2 : "whitespace"
1879 * ht nl cr sp
1880 * 3 : special (must be base64 encoded)
1881 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1882 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001883
/* Per-character UTF-7 encoder category for the 128 ASCII codes:
 * 0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base-64
 * encoded).  The table is only ever read, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must be free of side effects.  NUL and
 * anything >= 128 are never direct; Set D characters always are; Set O
 * and whitespace are direct only when directO / directWS are non-zero. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001915
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001916PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 Py_ssize_t size,
1918 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001919{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001920 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1921}
1922
Antoine Pitrou244651a2009-05-04 18:56:13 +00001923/* The decoder. The only state we preserve is our read position,
1924 * i.e. how many characters we have consumed. So if we end in the
1925 * middle of a shift sequence we have to back off the read position
1926 * and the output to the beginning of the sequence, otherwise we lose
1927 * all the shift state (seen bits, number of bits seen, high
1928 * surrogate). */
1929
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001930PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001931 Py_ssize_t size,
1932 const char *errors,
1933 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001934{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001936 Py_ssize_t startinpos;
1937 Py_ssize_t endinpos;
1938 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001939 const char *e;
1940 PyUnicodeObject *unicode;
1941 Py_UNICODE *p;
1942 const char *errmsg = "";
1943 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001944 Py_UNICODE *shiftOutStart;
1945 unsigned int base64bits = 0;
1946 unsigned long base64buffer = 0;
1947 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 PyObject *errorHandler = NULL;
1949 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001950
1951 unicode = _PyUnicode_New(size);
1952 if (!unicode)
1953 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001954 if (size == 0) {
1955 if (consumed)
1956 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001958 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001959
1960 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001961 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 e = s + size;
1963
1964 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001966 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001967 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001968
Antoine Pitrou244651a2009-05-04 18:56:13 +00001969 if (inShift) { /* in a base-64 section */
1970 if (IS_BASE64(ch)) { /* consume a base-64 character */
1971 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1972 base64bits += 6;
1973 s++;
1974 if (base64bits >= 16) {
1975 /* we have enough bits for a UTF-16 value */
1976 Py_UNICODE outCh = (Py_UNICODE)
1977 (base64buffer >> (base64bits-16));
1978 base64bits -= 16;
1979 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1980 if (surrogate) {
1981 /* expecting a second surrogate */
1982 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1983#ifdef Py_UNICODE_WIDE
1984 *p++ = (((surrogate & 0x3FF)<<10)
1985 | (outCh & 0x3FF)) + 0x10000;
1986#else
1987 *p++ = surrogate;
1988 *p++ = outCh;
1989#endif
1990 surrogate = 0;
1991 }
1992 else {
1993 surrogate = 0;
1994 errmsg = "second surrogate missing";
1995 goto utf7Error;
1996 }
1997 }
1998 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1999 /* first surrogate */
2000 surrogate = outCh;
2001 }
2002 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2003 errmsg = "unexpected second surrogate";
2004 goto utf7Error;
2005 }
2006 else {
2007 *p++ = outCh;
2008 }
2009 }
2010 }
2011 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002012 inShift = 0;
2013 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002014 if (surrogate) {
2015 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002016 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002017 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002018 if (base64bits > 0) { /* left-over bits */
2019 if (base64bits >= 6) {
2020 /* We've seen at least one base-64 character */
2021 errmsg = "partial character in shift sequence";
2022 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002023 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002024 else {
2025 /* Some bits remain; they should be zero */
2026 if (base64buffer != 0) {
2027 errmsg = "non-zero padding bits in shift sequence";
2028 goto utf7Error;
2029 }
2030 }
2031 }
2032 if (ch != '-') {
2033 /* '-' is absorbed; other terminating
2034 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002035 *p++ = ch;
2036 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002037 }
2038 }
2039 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002040 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002041 s++; /* consume '+' */
2042 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002043 s++;
2044 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002045 }
2046 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002047 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002048 shiftOutStart = p;
2049 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002050 }
2051 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002052 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002053 *p++ = ch;
2054 s++;
2055 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002056 else {
2057 startinpos = s-starts;
2058 s++;
2059 errmsg = "unexpected special character";
2060 goto utf7Error;
2061 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002062 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002063utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 outpos = p-PyUnicode_AS_UNICODE(unicode);
2065 endinpos = s-starts;
2066 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002067 errors, &errorHandler,
2068 "utf7", errmsg,
2069 &starts, &e, &startinpos, &endinpos, &exc, &s,
2070 &unicode, &outpos, &p))
2071 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002072 }
2073
Antoine Pitrou244651a2009-05-04 18:56:13 +00002074 /* end of string */
2075
2076 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2077 /* if we're in an inconsistent state, that's an error */
2078 if (surrogate ||
2079 (base64bits >= 6) ||
2080 (base64bits > 0 && base64buffer != 0)) {
2081 outpos = p-PyUnicode_AS_UNICODE(unicode);
2082 endinpos = size;
2083 if (unicode_decode_call_errorhandler(
2084 errors, &errorHandler,
2085 "utf7", "unterminated shift sequence",
2086 &starts, &e, &startinpos, &endinpos, &exc, &s,
2087 &unicode, &outpos, &p))
2088 goto onError;
2089 if (s < e)
2090 goto restart;
2091 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002092 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002093
2094 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002095 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002096 if (inShift) {
2097 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002098 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002099 }
2100 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002101 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002102 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002103 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002104
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002105 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002106 goto onError;
2107
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002108 Py_XDECREF(errorHandler);
2109 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002110 return (PyObject *)unicode;
2111
Benjamin Peterson29060642009-01-31 22:14:21 +00002112 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 Py_XDECREF(errorHandler);
2114 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002115 Py_DECREF(unicode);
2116 return NULL;
2117}
2118
2119
2120PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002121 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002122 int base64SetO,
2123 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002124 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002125{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002126 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002127 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002128 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002129 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002130 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002131 unsigned int base64bits = 0;
2132 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133 char * out;
2134 char * start;
2135
2136 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002137 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002138
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002139 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002140 return PyErr_NoMemory();
2141
Antoine Pitrou244651a2009-05-04 18:56:13 +00002142 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002143 if (v == NULL)
2144 return NULL;
2145
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002146 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002147 for (;i < size; ++i) {
2148 Py_UNICODE ch = s[i];
2149
Antoine Pitrou244651a2009-05-04 18:56:13 +00002150 if (inShift) {
2151 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2152 /* shifting out */
2153 if (base64bits) { /* output remaining bits */
2154 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2155 base64buffer = 0;
2156 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002157 }
2158 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002159 /* Characters not in the BASE64 set implicitly unshift the sequence
2160 so no '-' is required, except if the character is itself a '-' */
2161 if (IS_BASE64(ch) || ch == '-') {
2162 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002164 *out++ = (char) ch;
2165 }
2166 else {
2167 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002168 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002170 else { /* not in a shift sequence */
2171 if (ch == '+') {
2172 *out++ = '+';
2173 *out++ = '-';
2174 }
2175 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2176 *out++ = (char) ch;
2177 }
2178 else {
2179 *out++ = '+';
2180 inShift = 1;
2181 goto encode_char;
2182 }
2183 }
2184 continue;
2185encode_char:
2186#ifdef Py_UNICODE_WIDE
2187 if (ch >= 0x10000) {
2188 /* code first surrogate */
2189 base64bits += 16;
2190 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2191 while (base64bits >= 6) {
2192 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2193 base64bits -= 6;
2194 }
2195 /* prepare second surrogate */
2196 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2197 }
2198#endif
2199 base64bits += 16;
2200 base64buffer = (base64buffer << 16) | ch;
2201 while (base64bits >= 6) {
2202 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2203 base64bits -= 6;
2204 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002205 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002206 if (base64bits)
2207 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2208 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002209 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002210 if (_PyBytes_Resize(&v, out - start) < 0)
2211 return NULL;
2212 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002213}
2214
Antoine Pitrou244651a2009-05-04 18:56:13 +00002215#undef IS_BASE64
2216#undef FROM_BASE64
2217#undef TO_BASE64
2218#undef DECODE_DIRECT
2219#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002220
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221/* --- UTF-8 Codec -------------------------------------------------------- */
2222
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.
   Indexed by the first byte of a sequence; the table is only ever read,
   hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four-, five- and six-byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2244
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002246 Py_ssize_t size,
2247 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248{
Walter Dörwald69652032004-09-07 20:24:22 +00002249 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2250}
2251
Antoine Pitrouab868312009-01-10 15:40:25 +00002252/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2253#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2254
2255/* Mask to quickly check whether a C 'long' contains a
2256 non-ASCII, UTF8-encoded char. */
2257#if (SIZEOF_LONG == 8)
2258# define ASCII_CHAR_MASK 0x8080808080808080L
2259#elif (SIZEOF_LONG == 4)
2260# define ASCII_CHAR_MASK 0x80808080L
2261#else
2262# error C 'long' size should be either 4 or 8!
2263#endif
2264
Walter Dörwald69652032004-09-07 20:24:22 +00002265PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002266 Py_ssize_t size,
2267 const char *errors,
2268 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002269{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002272 Py_ssize_t startinpos;
2273 Py_ssize_t endinpos;
2274 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002275 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 PyUnicodeObject *unicode;
2277 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002278 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002279 PyObject *errorHandler = NULL;
2280 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281
2282 /* Note: size will always be longer than the resulting Unicode
2283 character count */
2284 unicode = _PyUnicode_New(size);
2285 if (!unicode)
2286 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002287 if (size == 0) {
2288 if (consumed)
2289 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 /* Unpack UTF-8 encoded data */
2294 p = unicode->str;
2295 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002296 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297
2298 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002299 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300
2301 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002302 /* Fast path for runs of ASCII characters. Given that common UTF-8
2303 input will consist of an overwhelming majority of ASCII
2304 characters, we try to optimize for this case by checking
2305 as many characters as a C 'long' can contain.
2306 First, check if we can do an aligned read, as most CPUs have
2307 a penalty for unaligned reads.
2308 */
2309 if (!((size_t) s & LONG_PTR_MASK)) {
2310 /* Help register allocation */
2311 register const char *_s = s;
2312 register Py_UNICODE *_p = p;
2313 while (_s < aligned_end) {
2314 /* Read a whole long at a time (either 4 or 8 bytes),
2315 and do a fast unrolled copy if it only contains ASCII
2316 characters. */
2317 unsigned long data = *(unsigned long *) _s;
2318 if (data & ASCII_CHAR_MASK)
2319 break;
2320 _p[0] = (unsigned char) _s[0];
2321 _p[1] = (unsigned char) _s[1];
2322 _p[2] = (unsigned char) _s[2];
2323 _p[3] = (unsigned char) _s[3];
2324#if (SIZEOF_LONG == 8)
2325 _p[4] = (unsigned char) _s[4];
2326 _p[5] = (unsigned char) _s[5];
2327 _p[6] = (unsigned char) _s[6];
2328 _p[7] = (unsigned char) _s[7];
2329#endif
2330 _s += SIZEOF_LONG;
2331 _p += SIZEOF_LONG;
2332 }
2333 s = _s;
2334 p = _p;
2335 if (s == e)
2336 break;
2337 ch = (unsigned char)*s;
2338 }
2339 }
2340
2341 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002342 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343 s++;
2344 continue;
2345 }
2346
2347 n = utf8_code_length[ch];
2348
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002349 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002350 if (consumed)
2351 break;
2352 else {
2353 errmsg = "unexpected end of data";
2354 startinpos = s-starts;
2355 endinpos = size;
2356 goto utf8Error;
2357 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002359
2360 switch (n) {
2361
2362 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002363 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002364 startinpos = s-starts;
2365 endinpos = startinpos+1;
2366 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367
2368 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002369 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002370 startinpos = s-starts;
2371 endinpos = startinpos+1;
2372 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373
2374 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002375 if ((s[1] & 0xc0) != 0x80) {
2376 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002377 startinpos = s-starts;
2378 endinpos = startinpos+2;
2379 goto utf8Error;
2380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002382 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002383 startinpos = s-starts;
2384 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002385 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002386 goto utf8Error;
2387 }
2388 else
2389 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390 break;
2391
2392 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002393 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002394 (s[2] & 0xc0) != 0x80) {
2395 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002396 startinpos = s-starts;
2397 endinpos = startinpos+3;
2398 goto utf8Error;
2399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002401 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002402 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002403 startinpos = s-starts;
2404 endinpos = startinpos+3;
2405 goto utf8Error;
2406 }
2407 else
2408 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002409 break;
2410
2411 case 4:
2412 if ((s[1] & 0xc0) != 0x80 ||
2413 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002414 (s[3] & 0xc0) != 0x80) {
2415 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002416 startinpos = s-starts;
2417 endinpos = startinpos+4;
2418 goto utf8Error;
2419 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002420 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002422 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002423 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002424 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002425 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002426 UTF-16 */
2427 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002428 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002429 startinpos = s-starts;
2430 endinpos = startinpos+4;
2431 goto utf8Error;
2432 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002433#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002434 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002435#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002436 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002437
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002438 /* translate from 10000..10FFFF to 0..FFFF */
2439 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002440
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002441 /* high surrogate = top 10 bits added to D800 */
2442 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002443
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002444 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002445 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002446#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 break;
2448
2449 default:
2450 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002451 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002452 startinpos = s-starts;
2453 endinpos = startinpos+n;
2454 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455 }
2456 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002457 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002458
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 utf8Error:
2460 outpos = p-PyUnicode_AS_UNICODE(unicode);
2461 if (unicode_decode_call_errorhandler(
2462 errors, &errorHandler,
2463 "utf8", errmsg,
2464 &starts, &e, &startinpos, &endinpos, &exc, &s,
2465 &unicode, &outpos, &p))
2466 goto onError;
2467 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 }
Walter Dörwald69652032004-09-07 20:24:22 +00002469 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002470 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471
2472 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002473 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 goto onError;
2475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002476 Py_XDECREF(errorHandler);
2477 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 return (PyObject *)unicode;
2479
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002481 Py_XDECREF(errorHandler);
2482 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483 Py_DECREF(unicode);
2484 return NULL;
2485}
2486
Antoine Pitrouab868312009-01-10 15:40:25 +00002487#undef ASCII_CHAR_MASK
2488
2489
Tim Peters602f7402002-04-27 18:03:26 +00002490/* Allocation strategy: if the string is short, convert into a stack buffer
2491 and allocate exactly as much space needed at the end. Else allocate the
2492 maximum possible needed (4 result bytes per Unicode character), and return
2493 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002494*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002495PyObject *
2496PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002497 Py_ssize_t size,
2498 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499{
Tim Peters602f7402002-04-27 18:03:26 +00002500#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002501
Guido van Rossum98297ee2007-11-06 21:34:58 +00002502 Py_ssize_t i; /* index into s of next input byte */
2503 PyObject *result; /* result string object */
2504 char *p; /* next free byte in output buffer */
2505 Py_ssize_t nallocated; /* number of result bytes allocated */
2506 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002507 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002508 PyObject *errorHandler = NULL;
2509 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002510
Tim Peters602f7402002-04-27 18:03:26 +00002511 assert(s != NULL);
2512 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513
Tim Peters602f7402002-04-27 18:03:26 +00002514 if (size <= MAX_SHORT_UNICHARS) {
2515 /* Write into the stack buffer; nallocated can't overflow.
2516 * At the end, we'll allocate exactly as much heap space as it
2517 * turns out we need.
2518 */
2519 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002520 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002521 p = stackbuf;
2522 }
2523 else {
2524 /* Overallocate on the heap, and give the excess back at the end. */
2525 nallocated = size * 4;
2526 if (nallocated / 4 != size) /* overflow! */
2527 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002528 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002529 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002530 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002531 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002532 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002533
Tim Peters602f7402002-04-27 18:03:26 +00002534 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002535 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002536
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002537 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002538 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002540
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002542 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002543 *p++ = (char)(0xc0 | (ch >> 6));
2544 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002545 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002546 else {
Tim Peters602f7402002-04-27 18:03:26 +00002547 /* Encode UCS2 Unicode ordinals */
2548 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002549#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002550 /* Special case: check for high surrogate */
2551 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2552 Py_UCS4 ch2 = s[i];
2553 /* Check for low surrogate and combine the two to
2554 form a UCS4 value */
2555 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002556 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002557 i++;
2558 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002559 }
Tim Peters602f7402002-04-27 18:03:26 +00002560 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002561 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002562#endif
2563 if (ch >= 0xd800 && ch <= 0xdfff) {
2564 Py_ssize_t newpos;
2565 PyObject *rep;
2566 char *prep;
2567 int k;
2568 rep = unicode_encode_call_errorhandler
2569 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2570 s, size, &exc, i-1, i, &newpos);
2571 if (!rep)
2572 goto error;
2573 /* Implementation limitations: only support error handler that return
2574 bytes, and only support up to four replacement bytes. */
2575 if (!PyBytes_Check(rep)) {
2576 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2577 Py_DECREF(rep);
2578 goto error;
2579 }
2580 if (PyBytes_Size(rep) > 4) {
2581 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2582 Py_DECREF(rep);
2583 goto error;
2584 }
2585 prep = PyBytes_AsString(rep);
2586 for(k = PyBytes_Size(rep); k > 0; k--)
2587 *p++ = *prep++;
2588 Py_DECREF(rep);
2589 continue;
2590
2591 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002592 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002593 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2594 *p++ = (char)(0x80 | (ch & 0x3f));
2595 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002596 }
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002597#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002598 encodeUCS4:
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002599#endif
Tim Peters602f7402002-04-27 18:03:26 +00002600 /* Encode UCS4 Unicode ordinals */
2601 *p++ = (char)(0xf0 | (ch >> 18));
2602 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2603 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2604 *p++ = (char)(0x80 | (ch & 0x3f));
2605 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002607
Guido van Rossum98297ee2007-11-06 21:34:58 +00002608 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002609 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002610 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002611 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002612 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002613 }
2614 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002615 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002616 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002617 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002618 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002619 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002620 Py_XDECREF(errorHandler);
2621 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002622 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002623 error:
2624 Py_XDECREF(errorHandler);
2625 Py_XDECREF(exc);
2626 Py_XDECREF(result);
2627 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002628
Tim Peters602f7402002-04-27 18:03:26 +00002629#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630}
2631
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2633{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 if (!PyUnicode_Check(unicode)) {
2635 PyErr_BadArgument();
2636 return NULL;
2637 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002638 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002639 PyUnicode_GET_SIZE(unicode),
2640 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641}
2642
Walter Dörwald41980ca2007-08-16 21:55:45 +00002643/* --- UTF-32 Codec ------------------------------------------------------- */
2644
2645PyObject *
2646PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002647 Py_ssize_t size,
2648 const char *errors,
2649 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002650{
2651 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2652}
2653
2654PyObject *
2655PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002656 Py_ssize_t size,
2657 const char *errors,
2658 int *byteorder,
2659 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002660{
2661 const char *starts = s;
2662 Py_ssize_t startinpos;
2663 Py_ssize_t endinpos;
2664 Py_ssize_t outpos;
2665 PyUnicodeObject *unicode;
2666 Py_UNICODE *p;
2667#ifndef Py_UNICODE_WIDE
2668 int i, pairs;
2669#else
2670 const int pairs = 0;
2671#endif
2672 const unsigned char *q, *e;
2673 int bo = 0; /* assume native ordering by default */
2674 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002675 /* Offsets from q for retrieving bytes in the right order. */
2676#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2677 int iorder[] = {0, 1, 2, 3};
2678#else
2679 int iorder[] = {3, 2, 1, 0};
2680#endif
2681 PyObject *errorHandler = NULL;
2682 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002683 /* On narrow builds we split characters outside the BMP into two
2684 codepoints => count how much extra space we need. */
2685#ifndef Py_UNICODE_WIDE
2686 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 if (((Py_UCS4 *)s)[i] >= 0x10000)
2688 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002689#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002690
2691 /* This might be one to much, because of a BOM */
2692 unicode = _PyUnicode_New((size+3)/4+pairs);
2693 if (!unicode)
2694 return NULL;
2695 if (size == 0)
2696 return (PyObject *)unicode;
2697
2698 /* Unpack UTF-32 encoded data */
2699 p = unicode->str;
2700 q = (unsigned char *)s;
2701 e = q + size;
2702
2703 if (byteorder)
2704 bo = *byteorder;
2705
2706 /* Check for BOM marks (U+FEFF) in the input and adjust current
2707 byte order setting accordingly. In native mode, the leading BOM
2708 mark is skipped, in all other modes, it is copied to the output
2709 stream as-is (giving a ZWNBSP character). */
2710 if (bo == 0) {
2711 if (size >= 4) {
2712 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002713 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002714#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002715 if (bom == 0x0000FEFF) {
2716 q += 4;
2717 bo = -1;
2718 }
2719 else if (bom == 0xFFFE0000) {
2720 q += 4;
2721 bo = 1;
2722 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002723#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002724 if (bom == 0x0000FEFF) {
2725 q += 4;
2726 bo = 1;
2727 }
2728 else if (bom == 0xFFFE0000) {
2729 q += 4;
2730 bo = -1;
2731 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002732#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002733 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002734 }
2735
2736 if (bo == -1) {
2737 /* force LE */
2738 iorder[0] = 0;
2739 iorder[1] = 1;
2740 iorder[2] = 2;
2741 iorder[3] = 3;
2742 }
2743 else if (bo == 1) {
2744 /* force BE */
2745 iorder[0] = 3;
2746 iorder[1] = 2;
2747 iorder[2] = 1;
2748 iorder[3] = 0;
2749 }
2750
2751 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002752 Py_UCS4 ch;
2753 /* remaining bytes at the end? (size should be divisible by 4) */
2754 if (e-q<4) {
2755 if (consumed)
2756 break;
2757 errmsg = "truncated data";
2758 startinpos = ((const char *)q)-starts;
2759 endinpos = ((const char *)e)-starts;
2760 goto utf32Error;
2761 /* The remaining input chars are ignored if the callback
2762 chooses to skip the input */
2763 }
2764 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2765 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002766
Benjamin Peterson29060642009-01-31 22:14:21 +00002767 if (ch >= 0x110000)
2768 {
2769 errmsg = "codepoint not in range(0x110000)";
2770 startinpos = ((const char *)q)-starts;
2771 endinpos = startinpos+4;
2772 goto utf32Error;
2773 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002774#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002775 if (ch >= 0x10000)
2776 {
2777 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2778 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2779 }
2780 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002781#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 *p++ = ch;
2783 q += 4;
2784 continue;
2785 utf32Error:
2786 outpos = p-PyUnicode_AS_UNICODE(unicode);
2787 if (unicode_decode_call_errorhandler(
2788 errors, &errorHandler,
2789 "utf32", errmsg,
2790 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2791 &unicode, &outpos, &p))
2792 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002793 }
2794
2795 if (byteorder)
2796 *byteorder = bo;
2797
2798 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002800
2801 /* Adjust length */
2802 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2803 goto onError;
2804
2805 Py_XDECREF(errorHandler);
2806 Py_XDECREF(exc);
2807 return (PyObject *)unicode;
2808
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002810 Py_DECREF(unicode);
2811 Py_XDECREF(errorHandler);
2812 Py_XDECREF(exc);
2813 return NULL;
2814}
2815
2816PyObject *
2817PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002818 Py_ssize_t size,
2819 const char *errors,
2820 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002821{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002822 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002823 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002824 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002825#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002826 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002827#else
2828 const int pairs = 0;
2829#endif
2830 /* Offsets from p for storing byte pairs in the right order. */
2831#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2832 int iorder[] = {0, 1, 2, 3};
2833#else
2834 int iorder[] = {3, 2, 1, 0};
2835#endif
2836
Benjamin Peterson29060642009-01-31 22:14:21 +00002837#define STORECHAR(CH) \
2838 do { \
2839 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2840 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2841 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2842 p[iorder[0]] = (CH) & 0xff; \
2843 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002844 } while(0)
2845
2846 /* In narrow builds we can output surrogate pairs as one codepoint,
2847 so we need less space. */
2848#ifndef Py_UNICODE_WIDE
2849 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002850 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2851 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2852 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002853#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002854 nsize = (size - pairs + (byteorder == 0));
2855 bytesize = nsize * 4;
2856 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002857 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002858 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002859 if (v == NULL)
2860 return NULL;
2861
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002862 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002863 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002865 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002866 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002867
2868 if (byteorder == -1) {
2869 /* force LE */
2870 iorder[0] = 0;
2871 iorder[1] = 1;
2872 iorder[2] = 2;
2873 iorder[3] = 3;
2874 }
2875 else if (byteorder == 1) {
2876 /* force BE */
2877 iorder[0] = 3;
2878 iorder[1] = 2;
2879 iorder[2] = 1;
2880 iorder[3] = 0;
2881 }
2882
2883 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002885#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002886 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2887 Py_UCS4 ch2 = *s;
2888 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2889 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2890 s++;
2891 size--;
2892 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002893 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002894#endif
2895 STORECHAR(ch);
2896 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002897
2898 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002899 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002900#undef STORECHAR
2901}
2902
2903PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2904{
2905 if (!PyUnicode_Check(unicode)) {
2906 PyErr_BadArgument();
2907 return NULL;
2908 }
2909 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002910 PyUnicode_GET_SIZE(unicode),
2911 NULL,
2912 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002913}
2914
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915/* --- UTF-16 Codec ------------------------------------------------------- */
2916
Tim Peters772747b2001-08-09 22:21:55 +00002917PyObject *
2918PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 Py_ssize_t size,
2920 const char *errors,
2921 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922{
Walter Dörwald69652032004-09-07 20:24:22 +00002923 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2924}
2925
Antoine Pitrouab868312009-01-10 15:40:25 +00002926/* Two masks for fast checking of whether a C 'long' may contain
2927 UTF16-encoded surrogate characters. This is an efficient heuristic,
2928 assuming that non-surrogate characters with a code point >= 0x8000 are
2929 rare in most input.
2930 FAST_CHAR_MASK is used when the input is in native byte ordering,
2931 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002932*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002933#if (SIZEOF_LONG == 8)
2934# define FAST_CHAR_MASK 0x8000800080008000L
2935# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2936#elif (SIZEOF_LONG == 4)
2937# define FAST_CHAR_MASK 0x80008000L
2938# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2939#else
2940# error C 'long' size should be either 4 or 8!
2941#endif
2942
Walter Dörwald69652032004-09-07 20:24:22 +00002943PyObject *
2944PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002945 Py_ssize_t size,
2946 const char *errors,
2947 int *byteorder,
2948 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002949{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002950 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002951 Py_ssize_t startinpos;
2952 Py_ssize_t endinpos;
2953 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954 PyUnicodeObject *unicode;
2955 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002956 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002957 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002958 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002959 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002960 /* Offsets from q for retrieving byte pairs in the right order. */
2961#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2962 int ihi = 1, ilo = 0;
2963#else
2964 int ihi = 0, ilo = 1;
2965#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002966 PyObject *errorHandler = NULL;
2967 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968
2969 /* Note: size will always be longer than the resulting Unicode
2970 character count */
2971 unicode = _PyUnicode_New(size);
2972 if (!unicode)
2973 return NULL;
2974 if (size == 0)
2975 return (PyObject *)unicode;
2976
2977 /* Unpack UTF-16 encoded data */
2978 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002979 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002980 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981
2982 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002983 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002985 /* Check for BOM marks (U+FEFF) in the input and adjust current
2986 byte order setting accordingly. In native mode, the leading BOM
2987 mark is skipped, in all other modes, it is copied to the output
2988 stream as-is (giving a ZWNBSP character). */
2989 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002990 if (size >= 2) {
2991 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002992#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002993 if (bom == 0xFEFF) {
2994 q += 2;
2995 bo = -1;
2996 }
2997 else if (bom == 0xFFFE) {
2998 q += 2;
2999 bo = 1;
3000 }
Tim Petersced69f82003-09-16 20:30:58 +00003001#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003002 if (bom == 0xFEFF) {
3003 q += 2;
3004 bo = 1;
3005 }
3006 else if (bom == 0xFFFE) {
3007 q += 2;
3008 bo = -1;
3009 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003010#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013
Tim Peters772747b2001-08-09 22:21:55 +00003014 if (bo == -1) {
3015 /* force LE */
3016 ihi = 1;
3017 ilo = 0;
3018 }
3019 else if (bo == 1) {
3020 /* force BE */
3021 ihi = 0;
3022 ilo = 1;
3023 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003024#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3025 native_ordering = ilo < ihi;
3026#else
3027 native_ordering = ilo > ihi;
3028#endif
Tim Peters772747b2001-08-09 22:21:55 +00003029
Antoine Pitrouab868312009-01-10 15:40:25 +00003030 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003031 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003033 /* First check for possible aligned read of a C 'long'. Unaligned
3034 reads are more expensive, better to defer to another iteration. */
3035 if (!((size_t) q & LONG_PTR_MASK)) {
3036 /* Fast path for runs of non-surrogate chars. */
3037 register const unsigned char *_q = q;
3038 Py_UNICODE *_p = p;
3039 if (native_ordering) {
3040 /* Native ordering is simple: as long as the input cannot
3041 possibly contain a surrogate char, do an unrolled copy
3042 of several 16-bit code points to the target object.
3043 The non-surrogate check is done on several input bytes
3044 at a time (as many as a C 'long' can contain). */
3045 while (_q < aligned_end) {
3046 unsigned long data = * (unsigned long *) _q;
3047 if (data & FAST_CHAR_MASK)
3048 break;
3049 _p[0] = ((unsigned short *) _q)[0];
3050 _p[1] = ((unsigned short *) _q)[1];
3051#if (SIZEOF_LONG == 8)
3052 _p[2] = ((unsigned short *) _q)[2];
3053 _p[3] = ((unsigned short *) _q)[3];
3054#endif
3055 _q += SIZEOF_LONG;
3056 _p += SIZEOF_LONG / 2;
3057 }
3058 }
3059 else {
3060 /* Byteswapped ordering is similar, but we must decompose
3061 the copy bytewise, and take care of zero'ing out the
3062 upper bytes if the target object is in 32-bit units
3063 (that is, in UCS-4 builds). */
3064 while (_q < aligned_end) {
3065 unsigned long data = * (unsigned long *) _q;
3066 if (data & SWAPPED_FAST_CHAR_MASK)
3067 break;
3068 /* Zero upper bytes in UCS-4 builds */
3069#if (Py_UNICODE_SIZE > 2)
3070 _p[0] = 0;
3071 _p[1] = 0;
3072#if (SIZEOF_LONG == 8)
3073 _p[2] = 0;
3074 _p[3] = 0;
3075#endif
3076#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003077 /* Issue #4916; UCS-4 builds on big endian machines must
3078 fill the two last bytes of each 4-byte unit. */
3079#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3080# define OFF 2
3081#else
3082# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003083#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003084 ((unsigned char *) _p)[OFF + 1] = _q[0];
3085 ((unsigned char *) _p)[OFF + 0] = _q[1];
3086 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3087 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3088#if (SIZEOF_LONG == 8)
3089 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3090 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3091 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3092 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3093#endif
3094#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003095 _q += SIZEOF_LONG;
3096 _p += SIZEOF_LONG / 2;
3097 }
3098 }
3099 p = _p;
3100 q = _q;
3101 if (q >= e)
3102 break;
3103 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003104 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003105
Benjamin Peterson14339b62009-01-31 16:36:08 +00003106 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003107
3108 if (ch < 0xD800 || ch > 0xDFFF) {
3109 *p++ = ch;
3110 continue;
3111 }
3112
3113 /* UTF-16 code pair: */
3114 if (q > e) {
3115 errmsg = "unexpected end of data";
3116 startinpos = (((const char *)q) - 2) - starts;
3117 endinpos = ((const char *)e) + 1 - starts;
3118 goto utf16Error;
3119 }
3120 if (0xD800 <= ch && ch <= 0xDBFF) {
3121 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3122 q += 2;
3123 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003124#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 *p++ = ch;
3126 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003127#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003128 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003129#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003130 continue;
3131 }
3132 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003133 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003134 startinpos = (((const char *)q)-4)-starts;
3135 endinpos = startinpos+2;
3136 goto utf16Error;
3137 }
3138
Benjamin Peterson14339b62009-01-31 16:36:08 +00003139 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003140 errmsg = "illegal encoding";
3141 startinpos = (((const char *)q)-2)-starts;
3142 endinpos = startinpos+2;
3143 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003144
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 utf16Error:
3146 outpos = p - PyUnicode_AS_UNICODE(unicode);
3147 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003148 errors,
3149 &errorHandler,
3150 "utf16", errmsg,
3151 &starts,
3152 (const char **)&e,
3153 &startinpos,
3154 &endinpos,
3155 &exc,
3156 (const char **)&q,
3157 &unicode,
3158 &outpos,
3159 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003160 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003162 /* remaining byte at the end? (size should be even) */
3163 if (e == q) {
3164 if (!consumed) {
3165 errmsg = "truncated data";
3166 startinpos = ((const char *)q) - starts;
3167 endinpos = ((const char *)e) + 1 - starts;
3168 outpos = p - PyUnicode_AS_UNICODE(unicode);
3169 if (unicode_decode_call_errorhandler(
3170 errors,
3171 &errorHandler,
3172 "utf16", errmsg,
3173 &starts,
3174 (const char **)&e,
3175 &startinpos,
3176 &endinpos,
3177 &exc,
3178 (const char **)&q,
3179 &unicode,
3180 &outpos,
3181 &p))
3182 goto onError;
3183 /* The remaining input chars are ignored if the callback
3184 chooses to skip the input */
3185 }
3186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187
3188 if (byteorder)
3189 *byteorder = bo;
3190
Walter Dörwald69652032004-09-07 20:24:22 +00003191 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003192 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003193
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003195 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 goto onError;
3197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003198 Py_XDECREF(errorHandler);
3199 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 return (PyObject *)unicode;
3201
Benjamin Peterson29060642009-01-31 22:14:21 +00003202 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003204 Py_XDECREF(errorHandler);
3205 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 return NULL;
3207}
3208
Antoine Pitrouab868312009-01-10 15:40:25 +00003209#undef FAST_CHAR_MASK
3210#undef SWAPPED_FAST_CHAR_MASK
3211
Tim Peters772747b2001-08-09 22:21:55 +00003212PyObject *
3213PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003214 Py_ssize_t size,
3215 const char *errors,
3216 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003218 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003219 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003220 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003221#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003222 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003223#else
3224 const int pairs = 0;
3225#endif
Tim Peters772747b2001-08-09 22:21:55 +00003226 /* Offsets from p for storing byte pairs in the right order. */
3227#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3228 int ihi = 1, ilo = 0;
3229#else
3230 int ihi = 0, ilo = 1;
3231#endif
3232
Benjamin Peterson29060642009-01-31 22:14:21 +00003233#define STORECHAR(CH) \
3234 do { \
3235 p[ihi] = ((CH) >> 8) & 0xff; \
3236 p[ilo] = (CH) & 0xff; \
3237 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003238 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003240#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003241 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 if (s[i] >= 0x10000)
3243 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003244#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003245 /* 2 * (size + pairs + (byteorder == 0)) */
3246 if (size > PY_SSIZE_T_MAX ||
3247 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003249 nsize = size + pairs + (byteorder == 0);
3250 bytesize = nsize * 2;
3251 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003252 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003253 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 if (v == NULL)
3255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003257 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003259 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003260 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003261 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003262
3263 if (byteorder == -1) {
3264 /* force LE */
3265 ihi = 1;
3266 ilo = 0;
3267 }
3268 else if (byteorder == 1) {
3269 /* force BE */
3270 ihi = 0;
3271 ilo = 1;
3272 }
3273
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003274 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 Py_UNICODE ch = *s++;
3276 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003277#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 if (ch >= 0x10000) {
3279 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3280 ch = 0xD800 | ((ch-0x10000) >> 10);
3281 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003282#endif
Tim Peters772747b2001-08-09 22:21:55 +00003283 STORECHAR(ch);
3284 if (ch2)
3285 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003286 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003287
3288 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003289 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003290#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291}
3292
3293PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3294{
3295 if (!PyUnicode_Check(unicode)) {
3296 PyErr_BadArgument();
3297 return NULL;
3298 }
3299 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003300 PyUnicode_GET_SIZE(unicode),
3301 NULL,
3302 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303}
3304
3305/* --- Unicode Escape Codec ----------------------------------------------- */
3306
Fredrik Lundh06d12682001-01-24 07:59:11 +00003307static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003308
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 Py_ssize_t size,
3311 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003313 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003314 Py_ssize_t startinpos;
3315 Py_ssize_t endinpos;
3316 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003321 char* message;
3322 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003323 PyObject *errorHandler = NULL;
3324 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003325
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 /* Escaped strings will always be longer than the resulting
3327 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003328 length after conversion to the true value.
3329 (but if the error callback returns a long replacement string
3330 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331 v = _PyUnicode_New(size);
3332 if (v == NULL)
3333 goto onError;
3334 if (size == 0)
3335 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003337 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003339
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 while (s < end) {
3341 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003342 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344
3345 /* Non-escape characters are interpreted as Unicode ordinals */
3346 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003347 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 continue;
3349 }
3350
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003351 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 /* \ - Escapes */
3353 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003354 c = *s++;
3355 if (s > end)
3356 c = '\0'; /* Invalid after \ */
3357 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358
Benjamin Peterson29060642009-01-31 22:14:21 +00003359 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 case '\n': break;
3361 case '\\': *p++ = '\\'; break;
3362 case '\'': *p++ = '\''; break;
3363 case '\"': *p++ = '\"'; break;
3364 case 'b': *p++ = '\b'; break;
3365 case 'f': *p++ = '\014'; break; /* FF */
3366 case 't': *p++ = '\t'; break;
3367 case 'n': *p++ = '\n'; break;
3368 case 'r': *p++ = '\r'; break;
3369 case 'v': *p++ = '\013'; break; /* VT */
3370 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3371
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 case '0': case '1': case '2': case '3':
3374 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003375 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003376 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003377 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003378 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003379 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003381 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382 break;
3383
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 /* hex escapes */
3385 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003386 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003387 digits = 2;
3388 message = "truncated \\xXX escape";
3389 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003393 digits = 4;
3394 message = "truncated \\uXXXX escape";
3395 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396
Benjamin Peterson29060642009-01-31 22:14:21 +00003397 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003398 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003399 digits = 8;
3400 message = "truncated \\UXXXXXXXX escape";
3401 hexescape:
3402 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 outpos = p-PyUnicode_AS_UNICODE(v);
3404 if (s+digits>end) {
3405 endinpos = size;
3406 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003407 errors, &errorHandler,
3408 "unicodeescape", "end of string in escape sequence",
3409 &starts, &end, &startinpos, &endinpos, &exc, &s,
3410 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 goto onError;
3412 goto nextByte;
3413 }
3414 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003415 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003416 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 endinpos = (s+i+1)-starts;
3418 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003419 errors, &errorHandler,
3420 "unicodeescape", message,
3421 &starts, &end, &startinpos, &endinpos, &exc, &s,
3422 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003423 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003424 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003425 }
3426 chr = (chr<<4) & ~0xF;
3427 if (c >= '0' && c <= '9')
3428 chr += c - '0';
3429 else if (c >= 'a' && c <= 'f')
3430 chr += 10 + c - 'a';
3431 else
3432 chr += 10 + c - 'A';
3433 }
3434 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003435 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 /* _decoding_error will have already written into the
3437 target buffer. */
3438 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003439 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003440 /* when we get here, chr is a 32-bit unicode character */
3441 if (chr <= 0xffff)
3442 /* UCS-2 character */
3443 *p++ = (Py_UNICODE) chr;
3444 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003445 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003446 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003447#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003448 *p++ = chr;
3449#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003450 chr -= 0x10000L;
3451 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003452 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003453#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003454 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003455 endinpos = s-starts;
3456 outpos = p-PyUnicode_AS_UNICODE(v);
3457 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003458 errors, &errorHandler,
3459 "unicodeescape", "illegal Unicode character",
3460 &starts, &end, &startinpos, &endinpos, &exc, &s,
3461 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003462 goto onError;
3463 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003464 break;
3465
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003467 case 'N':
3468 message = "malformed \\N character escape";
3469 if (ucnhash_CAPI == NULL) {
3470 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003471 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003472 if (ucnhash_CAPI == NULL)
3473 goto ucnhashError;
3474 }
3475 if (*s == '{') {
3476 const char *start = s+1;
3477 /* look for the closing brace */
3478 while (*s != '}' && s < end)
3479 s++;
3480 if (s > start && s < end && *s == '}') {
3481 /* found a name. look it up in the unicode database */
3482 message = "unknown Unicode character name";
3483 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003484 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003485 goto store;
3486 }
3487 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 endinpos = s-starts;
3489 outpos = p-PyUnicode_AS_UNICODE(v);
3490 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003491 errors, &errorHandler,
3492 "unicodeescape", message,
3493 &starts, &end, &startinpos, &endinpos, &exc, &s,
3494 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003495 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003496 break;
3497
3498 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003499 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500 message = "\\ at end of string";
3501 s--;
3502 endinpos = s-starts;
3503 outpos = p-PyUnicode_AS_UNICODE(v);
3504 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003505 errors, &errorHandler,
3506 "unicodeescape", message,
3507 &starts, &end, &startinpos, &endinpos, &exc, &s,
3508 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003509 goto onError;
3510 }
3511 else {
3512 *p++ = '\\';
3513 *p++ = (unsigned char)s[-1];
3514 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003515 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003517 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003520 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003522 Py_XDECREF(errorHandler);
3523 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003525
Benjamin Peterson29060642009-01-31 22:14:21 +00003526 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003527 PyErr_SetString(
3528 PyExc_UnicodeError,
3529 "\\N escapes not supported (can't load unicodedata module)"
3530 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003531 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532 Py_XDECREF(errorHandler);
3533 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003534 return NULL;
3535
Benjamin Peterson29060642009-01-31 22:14:21 +00003536 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 Py_XDECREF(errorHandler);
3539 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 return NULL;
3541}
3542
/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment used to describe a `quotes` flag that wrapped the
   result in u"" or u'' quotes.  PyUnicode_EncodeUnicodeEscape() in this
   file takes only (s, size) and emits no surrounding quotes.

*/
3549
Thomas Wouters477c8d52006-05-27 19:21:47 +00003550Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003551 Py_ssize_t size,
3552 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003553{
3554 /* like wcschr, but doesn't stop at NULL characters */
3555
3556 while (size-- > 0) {
3557 if (*s == ch)
3558 return s;
3559 s++;
3560 }
3561
3562 return NULL;
3563}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003564
Walter Dörwald79e913e2007-05-12 11:08:06 +00003565static const char *hexdigits = "0123456789abcdef";
3566
3567PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003568 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003570 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003573#ifdef Py_UNICODE_WIDE
3574 const Py_ssize_t expandsize = 10;
3575#else
3576 const Py_ssize_t expandsize = 6;
3577#endif
3578
Thomas Wouters89f507f2006-12-13 04:49:30 +00003579 /* XXX(nnorwitz): rather than over-allocating, it would be
3580 better to choose a different scheme. Perhaps scan the
3581 first N-chars of the string and allocate based on that size.
3582 */
3583 /* Initial allocation is based on the longest-possible unichr
3584 escape.
3585
3586 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3587 unichr, so in this case it's the longest unichr escape. In
3588 narrow (UTF-16) builds this is five chars per source unichr
3589 since there are two unichrs in the surrogate pair, so in narrow
3590 (UTF-16) builds it's not the longest unichr escape.
3591
3592 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3593 so in the narrow (UTF-16) build case it's the longest unichr
3594 escape.
3595 */
3596
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003597 if (size == 0)
3598 return PyBytes_FromStringAndSize(NULL, 0);
3599
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003600 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003602
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003603 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003604 2
3605 + expandsize*size
3606 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 if (repr == NULL)
3608 return NULL;
3609
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003610 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 while (size-- > 0) {
3613 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003614
Walter Dörwald79e913e2007-05-12 11:08:06 +00003615 /* Escape backslashes */
3616 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 *p++ = '\\';
3618 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003619 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003620 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003621
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003622#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003623 /* Map 21-bit characters to '\U00xxxxxx' */
3624 else if (ch >= 0x10000) {
3625 *p++ = '\\';
3626 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003627 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3628 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3629 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3630 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3631 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3632 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3633 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3634 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003636 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003637#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3639 else if (ch >= 0xD800 && ch < 0xDC00) {
3640 Py_UNICODE ch2;
3641 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003642
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 ch2 = *s++;
3644 size--;
3645 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3646 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3647 *p++ = '\\';
3648 *p++ = 'U';
3649 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3650 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3651 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3652 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3653 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3654 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3655 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3656 *p++ = hexdigits[ucs & 0x0000000F];
3657 continue;
3658 }
3659 /* Fall through: isolated surrogates are copied as-is */
3660 s--;
3661 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003662 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003663#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003664
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003666 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667 *p++ = '\\';
3668 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003669 *p++ = hexdigits[(ch >> 12) & 0x000F];
3670 *p++ = hexdigits[(ch >> 8) & 0x000F];
3671 *p++ = hexdigits[(ch >> 4) & 0x000F];
3672 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003674
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003675 /* Map special whitespace to '\t', \n', '\r' */
3676 else if (ch == '\t') {
3677 *p++ = '\\';
3678 *p++ = 't';
3679 }
3680 else if (ch == '\n') {
3681 *p++ = '\\';
3682 *p++ = 'n';
3683 }
3684 else if (ch == '\r') {
3685 *p++ = '\\';
3686 *p++ = 'r';
3687 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003688
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003689 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003690 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003692 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003693 *p++ = hexdigits[(ch >> 4) & 0x000F];
3694 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003695 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003696
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 /* Copy everything else as-is */
3698 else
3699 *p++ = (char) ch;
3700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003702 assert(p - PyBytes_AS_STRING(repr) > 0);
3703 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3704 return NULL;
3705 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706}
3707
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003708PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003710 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 if (!PyUnicode_Check(unicode)) {
3712 PyErr_BadArgument();
3713 return NULL;
3714 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003715 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3716 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003717 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718}
3719
3720/* --- Raw Unicode Escape Codec ------------------------------------------- */
3721
3722PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 Py_ssize_t size,
3724 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003725{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003727 Py_ssize_t startinpos;
3728 Py_ssize_t endinpos;
3729 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 const char *end;
3733 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 PyObject *errorHandler = NULL;
3735 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003736
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 /* Escaped strings will always be longer than the resulting
3738 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 length after conversion to the true value. (But decoding error
3740 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 v = _PyUnicode_New(size);
3742 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003743 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003746 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 end = s + size;
3748 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003749 unsigned char c;
3750 Py_UCS4 x;
3751 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003752 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 /* Non-escape characters are interpreted as Unicode ordinals */
3755 if (*s != '\\') {
3756 *p++ = (unsigned char)*s++;
3757 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003758 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 startinpos = s-starts;
3760
3761 /* \u-escapes are only interpreted iff the number of leading
3762 backslashes if odd */
3763 bs = s;
3764 for (;s < end;) {
3765 if (*s != '\\')
3766 break;
3767 *p++ = (unsigned char)*s++;
3768 }
3769 if (((s - bs) & 1) == 0 ||
3770 s >= end ||
3771 (*s != 'u' && *s != 'U')) {
3772 continue;
3773 }
3774 p--;
3775 count = *s=='u' ? 4 : 8;
3776 s++;
3777
3778 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3779 outpos = p-PyUnicode_AS_UNICODE(v);
3780 for (x = 0, i = 0; i < count; ++i, ++s) {
3781 c = (unsigned char)*s;
3782 if (!ISXDIGIT(c)) {
3783 endinpos = s-starts;
3784 if (unicode_decode_call_errorhandler(
3785 errors, &errorHandler,
3786 "rawunicodeescape", "truncated \\uXXXX",
3787 &starts, &end, &startinpos, &endinpos, &exc, &s,
3788 &v, &outpos, &p))
3789 goto onError;
3790 goto nextByte;
3791 }
3792 x = (x<<4) & ~0xF;
3793 if (c >= '0' && c <= '9')
3794 x += c - '0';
3795 else if (c >= 'a' && c <= 'f')
3796 x += 10 + c - 'a';
3797 else
3798 x += 10 + c - 'A';
3799 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003800 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003801 /* UCS-2 character */
3802 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003803 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 /* UCS-4 character. Either store directly, or as
3805 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003806#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003807 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003808#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003809 x -= 0x10000L;
3810 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3811 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003812#endif
3813 } else {
3814 endinpos = s-starts;
3815 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003816 if (unicode_decode_call_errorhandler(
3817 errors, &errorHandler,
3818 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 &starts, &end, &startinpos, &endinpos, &exc, &s,
3820 &v, &outpos, &p))
3821 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 nextByte:
3824 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003826 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003827 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 Py_XDECREF(errorHandler);
3829 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003831
Benjamin Peterson29060642009-01-31 22:14:21 +00003832 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003834 Py_XDECREF(errorHandler);
3835 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 return NULL;
3837}
3838
3839PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003842 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 char *p;
3844 char *q;
3845
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003846#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003847 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003848#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003849 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003850#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003851
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003852 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003854
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003855 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 if (repr == NULL)
3857 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003858 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003859 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003861 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862 while (size-- > 0) {
3863 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003864#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 /* Map 32-bit characters to '\Uxxxxxxxx' */
3866 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003867 *p++ = '\\';
3868 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003869 *p++ = hexdigits[(ch >> 28) & 0xf];
3870 *p++ = hexdigits[(ch >> 24) & 0xf];
3871 *p++ = hexdigits[(ch >> 20) & 0xf];
3872 *p++ = hexdigits[(ch >> 16) & 0xf];
3873 *p++ = hexdigits[(ch >> 12) & 0xf];
3874 *p++ = hexdigits[(ch >> 8) & 0xf];
3875 *p++ = hexdigits[(ch >> 4) & 0xf];
3876 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003877 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003878 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003879#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003880 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3881 if (ch >= 0xD800 && ch < 0xDC00) {
3882 Py_UNICODE ch2;
3883 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003884
Benjamin Peterson29060642009-01-31 22:14:21 +00003885 ch2 = *s++;
3886 size--;
3887 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3888 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3889 *p++ = '\\';
3890 *p++ = 'U';
3891 *p++ = hexdigits[(ucs >> 28) & 0xf];
3892 *p++ = hexdigits[(ucs >> 24) & 0xf];
3893 *p++ = hexdigits[(ucs >> 20) & 0xf];
3894 *p++ = hexdigits[(ucs >> 16) & 0xf];
3895 *p++ = hexdigits[(ucs >> 12) & 0xf];
3896 *p++ = hexdigits[(ucs >> 8) & 0xf];
3897 *p++ = hexdigits[(ucs >> 4) & 0xf];
3898 *p++ = hexdigits[ucs & 0xf];
3899 continue;
3900 }
3901 /* Fall through: isolated surrogates are copied as-is */
3902 s--;
3903 size++;
3904 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003905#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 /* Map 16-bit characters to '\uxxxx' */
3907 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 *p++ = '\\';
3909 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003910 *p++ = hexdigits[(ch >> 12) & 0xf];
3911 *p++ = hexdigits[(ch >> 8) & 0xf];
3912 *p++ = hexdigits[(ch >> 4) & 0xf];
3913 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003915 /* Copy everything else as-is */
3916 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 *p++ = (char) ch;
3918 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003919 size = p - q;
3920
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003921 assert(size > 0);
3922 if (_PyBytes_Resize(&repr, size) < 0)
3923 return NULL;
3924 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925}
3926
3927PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3928{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003929 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003931 PyErr_BadArgument();
3932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003934 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3935 PyUnicode_GET_SIZE(unicode));
3936
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003937 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938}
3939
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003940/* --- Unicode Internal Codec ------------------------------------------- */
3941
3942PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003943 Py_ssize_t size,
3944 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003945{
3946 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003947 Py_ssize_t startinpos;
3948 Py_ssize_t endinpos;
3949 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003950 PyUnicodeObject *v;
3951 Py_UNICODE *p;
3952 const char *end;
3953 const char *reason;
3954 PyObject *errorHandler = NULL;
3955 PyObject *exc = NULL;
3956
Neal Norwitzd43069c2006-01-08 01:12:10 +00003957#ifdef Py_UNICODE_WIDE
3958 Py_UNICODE unimax = PyUnicode_GetMax();
3959#endif
3960
Thomas Wouters89f507f2006-12-13 04:49:30 +00003961 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003962 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3963 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003965 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003966 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003967 p = PyUnicode_AS_UNICODE(v);
3968 end = s + size;
3969
3970 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003971 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003972 /* We have to sanity check the raw data, otherwise doom looms for
3973 some malformed UCS-4 data. */
3974 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003975#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003976 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003977#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003978 end-s < Py_UNICODE_SIZE
3979 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003981 startinpos = s - starts;
3982 if (end-s < Py_UNICODE_SIZE) {
3983 endinpos = end-starts;
3984 reason = "truncated input";
3985 }
3986 else {
3987 endinpos = s - starts + Py_UNICODE_SIZE;
3988 reason = "illegal code point (> 0x10FFFF)";
3989 }
3990 outpos = p - PyUnicode_AS_UNICODE(v);
3991 if (unicode_decode_call_errorhandler(
3992 errors, &errorHandler,
3993 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003994 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003995 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003996 goto onError;
3997 }
3998 }
3999 else {
4000 p++;
4001 s += Py_UNICODE_SIZE;
4002 }
4003 }
4004
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004005 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004006 goto onError;
4007 Py_XDECREF(errorHandler);
4008 Py_XDECREF(exc);
4009 return (PyObject *)v;
4010
Benjamin Peterson29060642009-01-31 22:14:21 +00004011 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004012 Py_XDECREF(v);
4013 Py_XDECREF(errorHandler);
4014 Py_XDECREF(exc);
4015 return NULL;
4016}
4017
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018/* --- Latin-1 Codec ------------------------------------------------------ */
4019
4020PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004021 Py_ssize_t size,
4022 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023{
4024 PyUnicodeObject *v;
4025 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004026 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004027
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004029 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004030 Py_UNICODE r = *(unsigned char*)s;
4031 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004032 }
4033
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 v = _PyUnicode_New(size);
4035 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004036 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004040 e = s + size;
4041 /* Unrolling the copy makes it much faster by reducing the looping
4042 overhead. This is similar to what many memcpy() implementations do. */
4043 unrolled_end = e - 4;
4044 while (s < unrolled_end) {
4045 p[0] = (unsigned char) s[0];
4046 p[1] = (unsigned char) s[1];
4047 p[2] = (unsigned char) s[2];
4048 p[3] = (unsigned char) s[3];
4049 s += 4;
4050 p += 4;
4051 }
4052 while (s < e)
4053 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004055
Benjamin Peterson29060642009-01-31 22:14:21 +00004056 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 Py_XDECREF(v);
4058 return NULL;
4059}
4060
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061/* create or adjust a UnicodeEncodeError */
4062static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 const char *encoding,
4064 const Py_UNICODE *unicode, Py_ssize_t size,
4065 Py_ssize_t startpos, Py_ssize_t endpos,
4066 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004068 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 *exceptionObject = PyUnicodeEncodeError_Create(
4070 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 }
4072 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004073 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4074 goto onError;
4075 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4076 goto onError;
4077 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4078 goto onError;
4079 return;
4080 onError:
4081 Py_DECREF(*exceptionObject);
4082 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 }
4084}
4085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086/* raises a UnicodeEncodeError */
4087static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 const char *encoding,
4089 const Py_UNICODE *unicode, Py_ssize_t size,
4090 Py_ssize_t startpos, Py_ssize_t endpos,
4091 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092{
4093 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004094 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004095 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004096 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097}
4098
4099/* error handling callback helper:
4100 build arguments, call the callback and check the arguments,
4101 put the result into newpos and return the replacement string, which
4102 has to be freed by the caller */
4103static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004104 PyObject **errorHandler,
4105 const char *encoding, const char *reason,
4106 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4107 Py_ssize_t startpos, Py_ssize_t endpos,
4108 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004110 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004111
4112 PyObject *restuple;
4113 PyObject *resunicode;
4114
4115 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 }
4120
4121 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125
4126 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004127 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004131 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 Py_DECREF(restuple);
4133 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004135 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 &resunicode, newpos)) {
4137 Py_DECREF(restuple);
4138 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004140 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4141 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4142 Py_DECREF(restuple);
4143 return NULL;
4144 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004147 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4149 Py_DECREF(restuple);
4150 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 Py_INCREF(resunicode);
4153 Py_DECREF(restuple);
4154 return resunicode;
4155}
4156
4157static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 Py_ssize_t size,
4159 const char *errors,
4160 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161{
4162 /* output object */
4163 PyObject *res;
4164 /* pointers to the beginning and end+1 of input */
4165 const Py_UNICODE *startp = p;
4166 const Py_UNICODE *endp = p + size;
4167 /* pointer to the beginning of the unencodable characters */
4168 /* const Py_UNICODE *badp = NULL; */
4169 /* pointer into the output */
4170 char *str;
4171 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004172 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004173 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4174 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 PyObject *errorHandler = NULL;
4176 PyObject *exc = NULL;
4177 /* the following variable is used for caching string comparisons
4178 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4179 int known_errorHandler = -1;
4180
4181 /* allocate enough for a simple encoding without
4182 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004183 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004184 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004185 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004187 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004188 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 ressize = size;
4190
4191 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 /* can we encode this? */
4195 if (c<limit) {
4196 /* no overflow check, because we know that the space is enough */
4197 *str++ = (char)c;
4198 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004199 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 else {
4201 Py_ssize_t unicodepos = p-startp;
4202 Py_ssize_t requiredsize;
4203 PyObject *repunicode;
4204 Py_ssize_t repsize;
4205 Py_ssize_t newpos;
4206 Py_ssize_t respos;
4207 Py_UNICODE *uni2;
4208 /* startpos for collecting unencodable chars */
4209 const Py_UNICODE *collstart = p;
4210 const Py_UNICODE *collend = p;
4211 /* find all unecodable characters */
4212 while ((collend < endp) && ((*collend)>=limit))
4213 ++collend;
4214 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4215 if (known_errorHandler==-1) {
4216 if ((errors==NULL) || (!strcmp(errors, "strict")))
4217 known_errorHandler = 1;
4218 else if (!strcmp(errors, "replace"))
4219 known_errorHandler = 2;
4220 else if (!strcmp(errors, "ignore"))
4221 known_errorHandler = 3;
4222 else if (!strcmp(errors, "xmlcharrefreplace"))
4223 known_errorHandler = 4;
4224 else
4225 known_errorHandler = 0;
4226 }
4227 switch (known_errorHandler) {
4228 case 1: /* strict */
4229 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4230 goto onError;
4231 case 2: /* replace */
4232 while (collstart++<collend)
4233 *str++ = '?'; /* fall through */
4234 case 3: /* ignore */
4235 p = collend;
4236 break;
4237 case 4: /* xmlcharrefreplace */
4238 respos = str - PyBytes_AS_STRING(res);
4239 /* determine replacement size (temporarily (mis)uses p) */
4240 for (p = collstart, repsize = 0; p < collend; ++p) {
4241 if (*p<10)
4242 repsize += 2+1+1;
4243 else if (*p<100)
4244 repsize += 2+2+1;
4245 else if (*p<1000)
4246 repsize += 2+3+1;
4247 else if (*p<10000)
4248 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004249#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 else
4251 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004252#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 else if (*p<100000)
4254 repsize += 2+5+1;
4255 else if (*p<1000000)
4256 repsize += 2+6+1;
4257 else
4258 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004259#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 }
4261 requiredsize = respos+repsize+(endp-collend);
4262 if (requiredsize > ressize) {
4263 if (requiredsize<2*ressize)
4264 requiredsize = 2*ressize;
4265 if (_PyBytes_Resize(&res, requiredsize))
4266 goto onError;
4267 str = PyBytes_AS_STRING(res) + respos;
4268 ressize = requiredsize;
4269 }
4270 /* generate replacement (temporarily (mis)uses p) */
4271 for (p = collstart; p < collend; ++p) {
4272 str += sprintf(str, "&#%d;", (int)*p);
4273 }
4274 p = collend;
4275 break;
4276 default:
4277 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4278 encoding, reason, startp, size, &exc,
4279 collstart-startp, collend-startp, &newpos);
4280 if (repunicode == NULL)
4281 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004282 if (PyBytes_Check(repunicode)) {
4283 /* Directly copy bytes result to output. */
4284 repsize = PyBytes_Size(repunicode);
4285 if (repsize > 1) {
4286 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004287 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004288 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4289 Py_DECREF(repunicode);
4290 goto onError;
4291 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004292 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004293 ressize += repsize-1;
4294 }
4295 memcpy(str, PyBytes_AsString(repunicode), repsize);
4296 str += repsize;
4297 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004298 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004299 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004300 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004301 /* need more space? (at least enough for what we
4302 have+the replacement+the rest of the string, so
4303 we won't have to check space for encodable characters) */
4304 respos = str - PyBytes_AS_STRING(res);
4305 repsize = PyUnicode_GET_SIZE(repunicode);
4306 requiredsize = respos+repsize+(endp-collend);
4307 if (requiredsize > ressize) {
4308 if (requiredsize<2*ressize)
4309 requiredsize = 2*ressize;
4310 if (_PyBytes_Resize(&res, requiredsize)) {
4311 Py_DECREF(repunicode);
4312 goto onError;
4313 }
4314 str = PyBytes_AS_STRING(res) + respos;
4315 ressize = requiredsize;
4316 }
4317 /* check if there is anything unencodable in the replacement
4318 and copy it to the output */
4319 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4320 c = *uni2;
4321 if (c >= limit) {
4322 raise_encode_exception(&exc, encoding, startp, size,
4323 unicodepos, unicodepos+1, reason);
4324 Py_DECREF(repunicode);
4325 goto onError;
4326 }
4327 *str = (char)c;
4328 }
4329 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004330 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004331 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004332 }
4333 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004334 /* Resize if we allocated to much */
4335 size = str - PyBytes_AS_STRING(res);
4336 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004337 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004338 if (_PyBytes_Resize(&res, size) < 0)
4339 goto onError;
4340 }
4341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 Py_XDECREF(errorHandler);
4343 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004344 return res;
4345
4346 onError:
4347 Py_XDECREF(res);
4348 Py_XDECREF(errorHandler);
4349 Py_XDECREF(exc);
4350 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351}
4352
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 Py_ssize_t size,
4355 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358}
4359
4360PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4361{
4362 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 PyErr_BadArgument();
4364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 }
4366 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 PyUnicode_GET_SIZE(unicode),
4368 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369}
4370
4371/* --- 7-bit ASCII Codec -------------------------------------------------- */
4372
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004374 Py_ssize_t size,
4375 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 PyUnicodeObject *v;
4379 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004380 Py_ssize_t startinpos;
4381 Py_ssize_t endinpos;
4382 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 const char *e;
4384 PyObject *errorHandler = NULL;
4385 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004386
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004388 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 Py_UNICODE r = *(unsigned char*)s;
4390 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004391 }
Tim Petersced69f82003-09-16 20:30:58 +00004392
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 v = _PyUnicode_New(size);
4394 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 e = s + size;
4400 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 register unsigned char c = (unsigned char)*s;
4402 if (c < 128) {
4403 *p++ = c;
4404 ++s;
4405 }
4406 else {
4407 startinpos = s-starts;
4408 endinpos = startinpos + 1;
4409 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4410 if (unicode_decode_call_errorhandler(
4411 errors, &errorHandler,
4412 "ascii", "ordinal not in range(128)",
4413 &starts, &e, &startinpos, &endinpos, &exc, &s,
4414 &v, &outpos, &p))
4415 goto onError;
4416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004418 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4420 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 Py_XDECREF(errorHandler);
4422 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004424
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 Py_XDECREF(errorHandler);
4428 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 return NULL;
4430}
4431
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 Py_ssize_t size,
4434 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437}
4438
4439PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4440{
4441 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 PyErr_BadArgument();
4443 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 }
4445 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 PyUnicode_GET_SIZE(unicode),
4447 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448}
4449
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004450#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004451
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004452/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004453
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004454#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004455#define NEED_RETRY
4456#endif
4457
4458/* XXX This code is limited to "true" double-byte encodings, as
4459 a) it assumes an incomplete character consists of a single byte, and
4460 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004462
/* Report whether the byte at s[offset] should be treated as a DBCS lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise CharPrev() is
   used to find the previous character position, and 1 is returned when any
   of the following holds:
     - CharPrev() returned the same position (no previous character);
     - the byte at the previous position is not itself a lead byte;
     - the previous position is exactly two bytes back.
   In every other case 0 is returned. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4473
4474/*
4475 * Decode MBCS string into unicode object. If 'final' is set, converts
4476 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4477 */
4478static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 const char *s, /* MBCS string */
4480 int size, /* sizeof MBCS string */
4481 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004482{
4483 Py_UNICODE *p;
4484 Py_ssize_t n = 0;
4485 int usize = 0;
4486
4487 assert(size >= 0);
4488
4489 /* Skip trailing lead-byte unless 'final' is set */
4490 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004492
4493 /* First get the size of the result */
4494 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4496 if (usize == 0) {
4497 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4498 return -1;
4499 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004500 }
4501
4502 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 /* Create unicode object */
4504 *v = _PyUnicode_New(usize);
4505 if (*v == NULL)
4506 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004507 }
4508 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 /* Extend unicode object */
4510 n = PyUnicode_GET_SIZE(*v);
4511 if (_PyUnicode_Resize(v, n + usize) < 0)
4512 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004513 }
4514
4515 /* Do the conversion */
4516 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 p = PyUnicode_AS_UNICODE(*v) + n;
4518 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4519 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4520 return -1;
4521 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004522 }
4523
4524 return size;
4525}
4526
4527PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 Py_ssize_t size,
4529 const char *errors,
4530 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004531{
4532 PyUnicodeObject *v = NULL;
4533 int done;
4534
4535 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004537
4538#ifdef NEED_RETRY
4539 retry:
4540 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004542 else
4543#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004545
4546 if (done < 0) {
4547 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004549 }
4550
4551 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004553
4554#ifdef NEED_RETRY
4555 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 s += done;
4557 size -= done;
4558 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004559 }
4560#endif
4561
4562 return (PyObject *)v;
4563}
4564
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004565PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004566 Py_ssize_t size,
4567 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004568{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004569 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4570}
4571
4572/*
4573 * Convert unicode into string object (MBCS).
4574 * Returns 0 if succeed, -1 otherwise.
4575 */
4576static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 const Py_UNICODE *p, /* unicode */
4578 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004579{
4580 int mbcssize = 0;
4581 Py_ssize_t n = 0;
4582
4583 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004584
4585 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004586 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4588 if (mbcssize == 0) {
4589 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4590 return -1;
4591 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004592 }
4593
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004594 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 /* Create string object */
4596 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4597 if (*repr == NULL)
4598 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004599 }
4600 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 /* Extend string object */
4602 n = PyBytes_Size(*repr);
4603 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4604 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004605 }
4606
4607 /* Do the conversion */
4608 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004609 char *s = PyBytes_AS_STRING(*repr) + n;
4610 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4611 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4612 return -1;
4613 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004614 }
4615
4616 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004617}
4618
4619PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 Py_ssize_t size,
4621 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004622{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004623 PyObject *repr = NULL;
4624 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004625
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004626#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004628 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004630 else
4631#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004632 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004633
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004634 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 Py_XDECREF(repr);
4636 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004637 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004638
4639#ifdef NEED_RETRY
4640 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 p += INT_MAX;
4642 size -= INT_MAX;
4643 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004644 }
4645#endif
4646
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004647 return repr;
4648}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004649
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004650PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4651{
4652 if (!PyUnicode_Check(unicode)) {
4653 PyErr_BadArgument();
4654 return NULL;
4655 }
4656 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 PyUnicode_GET_SIZE(unicode),
4658 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004659}
4660
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004661#undef NEED_RETRY
4662
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004663#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004664
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665/* --- Character Mapping Codec -------------------------------------------- */
4666
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 Py_ssize_t size,
4669 PyObject *mapping,
4670 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004673 Py_ssize_t startinpos;
4674 Py_ssize_t endinpos;
4675 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 PyUnicodeObject *v;
4678 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004679 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 PyObject *errorHandler = NULL;
4681 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004682 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004683 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004684
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 /* Default to Latin-1 */
4686 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004687 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688
4689 v = _PyUnicode_New(size);
4690 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004691 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004696 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 mapstring = PyUnicode_AS_UNICODE(mapping);
4698 maplen = PyUnicode_GET_SIZE(mapping);
4699 while (s < e) {
4700 unsigned char ch = *s;
4701 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 if (ch < maplen)
4704 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 if (x == 0xfffe) {
4707 /* undefined mapping */
4708 outpos = p-PyUnicode_AS_UNICODE(v);
4709 startinpos = s-starts;
4710 endinpos = startinpos+1;
4711 if (unicode_decode_call_errorhandler(
4712 errors, &errorHandler,
4713 "charmap", "character maps to <undefined>",
4714 &starts, &e, &startinpos, &endinpos, &exc, &s,
4715 &v, &outpos, &p)) {
4716 goto onError;
4717 }
4718 continue;
4719 }
4720 *p++ = x;
4721 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004722 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004723 }
4724 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 while (s < e) {
4726 unsigned char ch = *s;
4727 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004728
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4730 w = PyLong_FromLong((long)ch);
4731 if (w == NULL)
4732 goto onError;
4733 x = PyObject_GetItem(mapping, w);
4734 Py_DECREF(w);
4735 if (x == NULL) {
4736 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4737 /* No mapping found means: mapping is undefined. */
4738 PyErr_Clear();
4739 x = Py_None;
4740 Py_INCREF(x);
4741 } else
4742 goto onError;
4743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004744
Benjamin Peterson29060642009-01-31 22:14:21 +00004745 /* Apply mapping */
4746 if (PyLong_Check(x)) {
4747 long value = PyLong_AS_LONG(x);
4748 if (value < 0 || value > 65535) {
4749 PyErr_SetString(PyExc_TypeError,
4750 "character mapping must be in range(65536)");
4751 Py_DECREF(x);
4752 goto onError;
4753 }
4754 *p++ = (Py_UNICODE)value;
4755 }
4756 else if (x == Py_None) {
4757 /* undefined mapping */
4758 outpos = p-PyUnicode_AS_UNICODE(v);
4759 startinpos = s-starts;
4760 endinpos = startinpos+1;
4761 if (unicode_decode_call_errorhandler(
4762 errors, &errorHandler,
4763 "charmap", "character maps to <undefined>",
4764 &starts, &e, &startinpos, &endinpos, &exc, &s,
4765 &v, &outpos, &p)) {
4766 Py_DECREF(x);
4767 goto onError;
4768 }
4769 Py_DECREF(x);
4770 continue;
4771 }
4772 else if (PyUnicode_Check(x)) {
4773 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004774
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 if (targetsize == 1)
4776 /* 1-1 mapping */
4777 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004778
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 else if (targetsize > 1) {
4780 /* 1-n mapping */
4781 if (targetsize > extrachars) {
4782 /* resize first */
4783 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4784 Py_ssize_t needed = (targetsize - extrachars) + \
4785 (targetsize << 2);
4786 extrachars += needed;
4787 /* XXX overflow detection missing */
4788 if (_PyUnicode_Resize(&v,
4789 PyUnicode_GET_SIZE(v) + needed) < 0) {
4790 Py_DECREF(x);
4791 goto onError;
4792 }
4793 p = PyUnicode_AS_UNICODE(v) + oldpos;
4794 }
4795 Py_UNICODE_COPY(p,
4796 PyUnicode_AS_UNICODE(x),
4797 targetsize);
4798 p += targetsize;
4799 extrachars -= targetsize;
4800 }
4801 /* 1-0 mapping: skip the character */
4802 }
4803 else {
4804 /* wrong return value */
4805 PyErr_SetString(PyExc_TypeError,
4806 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004807 Py_DECREF(x);
4808 goto onError;
4809 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 Py_DECREF(x);
4811 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 }
4814 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004815 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4816 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 Py_XDECREF(errorHandler);
4818 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004820
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 Py_XDECREF(errorHandler);
4823 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 Py_XDECREF(v);
4825 return NULL;
4826}
4827
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004828/* Charmap encoding: the lookup table */
4829
4830struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 PyObject_HEAD
4832 unsigned char level1[32];
4833 int count2, count3;
4834 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004835};
4836
4837static PyObject*
4838encoding_map_size(PyObject *obj, PyObject* args)
4839{
4840 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004841 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004843}
4844
4845static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004846 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 PyDoc_STR("Return the size (in bytes) of this object") },
4848 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004849};
4850
4851static void
4852encoding_map_dealloc(PyObject* o)
4853{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004854 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004855}
4856
4857static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004858 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004859 "EncodingMap", /*tp_name*/
4860 sizeof(struct encoding_map), /*tp_basicsize*/
4861 0, /*tp_itemsize*/
4862 /* methods */
4863 encoding_map_dealloc, /*tp_dealloc*/
4864 0, /*tp_print*/
4865 0, /*tp_getattr*/
4866 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004867 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 0, /*tp_repr*/
4869 0, /*tp_as_number*/
4870 0, /*tp_as_sequence*/
4871 0, /*tp_as_mapping*/
4872 0, /*tp_hash*/
4873 0, /*tp_call*/
4874 0, /*tp_str*/
4875 0, /*tp_getattro*/
4876 0, /*tp_setattro*/
4877 0, /*tp_as_buffer*/
4878 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4879 0, /*tp_doc*/
4880 0, /*tp_traverse*/
4881 0, /*tp_clear*/
4882 0, /*tp_richcompare*/
4883 0, /*tp_weaklistoffset*/
4884 0, /*tp_iter*/
4885 0, /*tp_iternext*/
4886 encoding_map_methods, /*tp_methods*/
4887 0, /*tp_members*/
4888 0, /*tp_getset*/
4889 0, /*tp_base*/
4890 0, /*tp_dict*/
4891 0, /*tp_descr_get*/
4892 0, /*tp_descr_set*/
4893 0, /*tp_dictoffset*/
4894 0, /*tp_init*/
4895 0, /*tp_alloc*/
4896 0, /*tp_new*/
4897 0, /*tp_free*/
4898 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004899};
4900
4901PyObject*
4902PyUnicode_BuildEncodingMap(PyObject* string)
4903{
4904 Py_UNICODE *decode;
4905 PyObject *result;
4906 struct encoding_map *mresult;
4907 int i;
4908 int need_dict = 0;
4909 unsigned char level1[32];
4910 unsigned char level2[512];
4911 unsigned char *mlevel1, *mlevel2, *mlevel3;
4912 int count2 = 0, count3 = 0;
4913
4914 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4915 PyErr_BadArgument();
4916 return NULL;
4917 }
4918 decode = PyUnicode_AS_UNICODE(string);
4919 memset(level1, 0xFF, sizeof level1);
4920 memset(level2, 0xFF, sizeof level2);
4921
4922 /* If there isn't a one-to-one mapping of NULL to \0,
4923 or if there are non-BMP characters, we need to use
4924 a mapping dictionary. */
4925 if (decode[0] != 0)
4926 need_dict = 1;
4927 for (i = 1; i < 256; i++) {
4928 int l1, l2;
4929 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004930#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004931 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004932#endif
4933 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004934 need_dict = 1;
4935 break;
4936 }
4937 if (decode[i] == 0xFFFE)
4938 /* unmapped character */
4939 continue;
4940 l1 = decode[i] >> 11;
4941 l2 = decode[i] >> 7;
4942 if (level1[l1] == 0xFF)
4943 level1[l1] = count2++;
4944 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004945 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004946 }
4947
4948 if (count2 >= 0xFF || count3 >= 0xFF)
4949 need_dict = 1;
4950
4951 if (need_dict) {
4952 PyObject *result = PyDict_New();
4953 PyObject *key, *value;
4954 if (!result)
4955 return NULL;
4956 for (i = 0; i < 256; i++) {
4957 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004958 key = PyLong_FromLong(decode[i]);
4959 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004960 if (!key || !value)
4961 goto failed1;
4962 if (PyDict_SetItem(result, key, value) == -1)
4963 goto failed1;
4964 Py_DECREF(key);
4965 Py_DECREF(value);
4966 }
4967 return result;
4968 failed1:
4969 Py_XDECREF(key);
4970 Py_XDECREF(value);
4971 Py_DECREF(result);
4972 return NULL;
4973 }
4974
4975 /* Create a three-level trie */
4976 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4977 16*count2 + 128*count3 - 1);
4978 if (!result)
4979 return PyErr_NoMemory();
4980 PyObject_Init(result, &EncodingMapType);
4981 mresult = (struct encoding_map*)result;
4982 mresult->count2 = count2;
4983 mresult->count3 = count3;
4984 mlevel1 = mresult->level1;
4985 mlevel2 = mresult->level23;
4986 mlevel3 = mresult->level23 + 16*count2;
4987 memcpy(mlevel1, level1, 32);
4988 memset(mlevel2, 0xFF, 16*count2);
4989 memset(mlevel3, 0, 128*count3);
4990 count3 = 0;
4991 for (i = 1; i < 256; i++) {
4992 int o1, o2, o3, i2, i3;
4993 if (decode[i] == 0xFFFE)
4994 /* unmapped character */
4995 continue;
4996 o1 = decode[i]>>11;
4997 o2 = (decode[i]>>7) & 0xF;
4998 i2 = 16*mlevel1[o1] + o2;
4999 if (mlevel2[i2] == 0xFF)
5000 mlevel2[i2] = count3++;
5001 o3 = decode[i] & 0x7F;
5002 i3 = 128*mlevel2[i2] + o3;
5003 mlevel3[i3] = i;
5004 }
5005 return result;
5006}
5007
5008static int
5009encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5010{
5011 struct encoding_map *map = (struct encoding_map*)mapping;
5012 int l1 = c>>11;
5013 int l2 = (c>>7) & 0xF;
5014 int l3 = c & 0x7F;
5015 int i;
5016
5017#ifdef Py_UNICODE_WIDE
5018 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005020 }
5021#endif
5022 if (c == 0)
5023 return 0;
5024 /* level 1*/
5025 i = map->level1[l1];
5026 if (i == 0xFF) {
5027 return -1;
5028 }
5029 /* level 2*/
5030 i = map->level23[16*i+l2];
5031 if (i == 0xFF) {
5032 return -1;
5033 }
5034 /* level 3 */
5035 i = map->level23[16*map->count2 + 128*i + l3];
5036 if (i == 0) {
5037 return -1;
5038 }
5039 return i;
5040}
5041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005042/* Lookup the character ch in the mapping. If the character
5043 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005044 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046{
Christian Heimes217cfd12007-12-02 14:31:20 +00005047 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005048 PyObject *x;
5049
5050 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005052 x = PyObject_GetItem(mapping, w);
5053 Py_DECREF(w);
5054 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5056 /* No mapping found means: mapping is undefined. */
5057 PyErr_Clear();
5058 x = Py_None;
5059 Py_INCREF(x);
5060 return x;
5061 } else
5062 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005064 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005066 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 long value = PyLong_AS_LONG(x);
5068 if (value < 0 || value > 255) {
5069 PyErr_SetString(PyExc_TypeError,
5070 "character mapping must be in range(256)");
5071 Py_DECREF(x);
5072 return NULL;
5073 }
5074 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005076 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 /* wrong return value */
5080 PyErr_Format(PyExc_TypeError,
5081 "character mapping must return integer, bytes or None, not %.400s",
5082 x->ob_type->tp_name);
5083 Py_DECREF(x);
5084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085 }
5086}
5087
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005088static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005089charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005090{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005091 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5092 /* exponentially overallocate to minimize reallocations */
5093 if (requiredsize < 2*outsize)
5094 requiredsize = 2*outsize;
5095 if (_PyBytes_Resize(outobj, requiredsize))
5096 return -1;
5097 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005098}
5099
/* Outcome of encoding one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005104 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105 space is available. Return a new reference to the object that
5106 was put in the output buffer, or Py_None, if the mapping was undefined
5107 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005108 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005110charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005113 PyObject *rep;
5114 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005115 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116
Christian Heimes90aa7642007-12-19 02:45:37 +00005117 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005118 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005120 if (res == -1)
5121 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 if (outsize<requiredsize)
5123 if (charmapencode_resize(outobj, outpos, requiredsize))
5124 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005125 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 outstart[(*outpos)++] = (char)res;
5127 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005128 }
5129
5130 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005131 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005133 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 Py_DECREF(rep);
5135 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005136 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 if (PyLong_Check(rep)) {
5138 Py_ssize_t requiredsize = *outpos+1;
5139 if (outsize<requiredsize)
5140 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5141 Py_DECREF(rep);
5142 return enc_EXCEPTION;
5143 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005144 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005146 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 else {
5148 const char *repchars = PyBytes_AS_STRING(rep);
5149 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5150 Py_ssize_t requiredsize = *outpos+repsize;
5151 if (outsize<requiredsize)
5152 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5153 Py_DECREF(rep);
5154 return enc_EXCEPTION;
5155 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005156 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 memcpy(outstart + *outpos, repchars, repsize);
5158 *outpos += repsize;
5159 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005161 Py_DECREF(rep);
5162 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005163}
5164
5165/* handle an error in PyUnicode_EncodeCharmap
5166 Return 0 on success, -1 on error */
5167static
5168int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005169 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005170 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005171 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005172 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005173{
5174 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005175 Py_ssize_t repsize;
5176 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005177 Py_UNICODE *uni2;
5178 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005179 Py_ssize_t collstartpos = *inpos;
5180 Py_ssize_t collendpos = *inpos+1;
5181 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182 char *encoding = "charmap";
5183 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005184 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005186 /* find all unencodable characters */
5187 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005188 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005189 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005190 int res = encoding_map_lookup(p[collendpos], mapping);
5191 if (res != -1)
5192 break;
5193 ++collendpos;
5194 continue;
5195 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005196
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 rep = charmapencode_lookup(p[collendpos], mapping);
5198 if (rep==NULL)
5199 return -1;
5200 else if (rep!=Py_None) {
5201 Py_DECREF(rep);
5202 break;
5203 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005204 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005206 }
5207 /* cache callback name lookup
5208 * (if not done yet, i.e. it's the first error) */
5209 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 if ((errors==NULL) || (!strcmp(errors, "strict")))
5211 *known_errorHandler = 1;
5212 else if (!strcmp(errors, "replace"))
5213 *known_errorHandler = 2;
5214 else if (!strcmp(errors, "ignore"))
5215 *known_errorHandler = 3;
5216 else if (!strcmp(errors, "xmlcharrefreplace"))
5217 *known_errorHandler = 4;
5218 else
5219 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005220 }
5221 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005222 case 1: /* strict */
5223 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5224 return -1;
5225 case 2: /* replace */
5226 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 x = charmapencode_output('?', mapping, res, respos);
5228 if (x==enc_EXCEPTION) {
5229 return -1;
5230 }
5231 else if (x==enc_FAILED) {
5232 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5233 return -1;
5234 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005235 }
5236 /* fall through */
5237 case 3: /* ignore */
5238 *inpos = collendpos;
5239 break;
5240 case 4: /* xmlcharrefreplace */
5241 /* generate replacement (temporarily (mis)uses p) */
5242 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 char buffer[2+29+1+1];
5244 char *cp;
5245 sprintf(buffer, "&#%d;", (int)p[collpos]);
5246 for (cp = buffer; *cp; ++cp) {
5247 x = charmapencode_output(*cp, mapping, res, respos);
5248 if (x==enc_EXCEPTION)
5249 return -1;
5250 else if (x==enc_FAILED) {
5251 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5252 return -1;
5253 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005254 }
5255 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005256 *inpos = collendpos;
5257 break;
5258 default:
5259 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 encoding, reason, p, size, exceptionObject,
5261 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005262 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005264 if (PyBytes_Check(repunicode)) {
5265 /* Directly copy bytes result to output. */
5266 Py_ssize_t outsize = PyBytes_Size(*res);
5267 Py_ssize_t requiredsize;
5268 repsize = PyBytes_Size(repunicode);
5269 requiredsize = *respos + repsize;
5270 if (requiredsize > outsize)
5271 /* Make room for all additional bytes. */
5272 if (charmapencode_resize(res, respos, requiredsize)) {
5273 Py_DECREF(repunicode);
5274 return -1;
5275 }
5276 memcpy(PyBytes_AsString(*res) + *respos,
5277 PyBytes_AsString(repunicode), repsize);
5278 *respos += repsize;
5279 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005280 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005281 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005283 /* generate replacement */
5284 repsize = PyUnicode_GET_SIZE(repunicode);
5285 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 x = charmapencode_output(*uni2, mapping, res, respos);
5287 if (x==enc_EXCEPTION) {
5288 return -1;
5289 }
5290 else if (x==enc_FAILED) {
5291 Py_DECREF(repunicode);
5292 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5293 return -1;
5294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005295 }
5296 *inpos = newpos;
5297 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298 }
5299 return 0;
5300}
5301
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 Py_ssize_t size,
5304 PyObject *mapping,
5305 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005307 /* output object */
5308 PyObject *res = NULL;
5309 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005310 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005311 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005312 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313 PyObject *errorHandler = NULL;
5314 PyObject *exc = NULL;
5315 /* the following variable is used for caching string comparisons
5316 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5317 * 3=ignore, 4=xmlcharrefreplace */
5318 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319
5320 /* Default to Latin-1 */
5321 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 /* allocate enough for a simple encoding without
5325 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005326 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327 if (res == NULL)
5328 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005329 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 /* try to encode it */
5334 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5335 if (x==enc_EXCEPTION) /* error */
5336 goto onError;
5337 if (x==enc_FAILED) { /* unencodable character */
5338 if (charmap_encoding_error(p, size, &inpos, mapping,
5339 &exc,
5340 &known_errorHandler, &errorHandler, errors,
5341 &res, &respos)) {
5342 goto onError;
5343 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005344 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 else
5346 /* done with this character => adjust input position */
5347 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005351 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005352 if (_PyBytes_Resize(&res, respos) < 0)
5353 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 Py_XDECREF(exc);
5356 Py_XDECREF(errorHandler);
5357 return res;
5358
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 Py_XDECREF(res);
5361 Py_XDECREF(exc);
5362 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 return NULL;
5364}
5365
5366PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
5369 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 PyErr_BadArgument();
5371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 }
5373 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 PyUnicode_GET_SIZE(unicode),
5375 mapping,
5376 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377}
5378
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005379/* create or adjust a UnicodeTranslateError */
5380static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 const Py_UNICODE *unicode, Py_ssize_t size,
5382 Py_ssize_t startpos, Py_ssize_t endpos,
5383 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005385 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005386 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 }
5389 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5391 goto onError;
5392 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5393 goto onError;
5394 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5395 goto onError;
5396 return;
5397 onError:
5398 Py_DECREF(*exceptionObject);
5399 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 }
5401}
5402
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403/* raises a UnicodeTranslateError */
5404static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 const Py_UNICODE *unicode, Py_ssize_t size,
5406 Py_ssize_t startpos, Py_ssize_t endpos,
5407 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005408{
5409 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005411 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005413}
5414
5415/* error handling callback helper:
5416 build arguments, call the callback and check the arguments,
5417 put the result into newpos and return the replacement string, which
5418 has to be freed by the caller */
5419static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 PyObject **errorHandler,
5421 const char *reason,
5422 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5423 Py_ssize_t startpos, Py_ssize_t endpos,
5424 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005426 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005428 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005429 PyObject *restuple;
5430 PyObject *resunicode;
5431
5432 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005436 }
5437
5438 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442
5443 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005448 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 Py_DECREF(restuple);
5450 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005451 }
5452 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 &resunicode, &i_newpos)) {
5454 Py_DECREF(restuple);
5455 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005456 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005457 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005459 else
5460 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005461 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5463 Py_DECREF(restuple);
5464 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005465 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005466 Py_INCREF(resunicode);
5467 Py_DECREF(restuple);
5468 return resunicode;
5469}
5470
5471/* Lookup the character ch in the mapping and put the result in result,
5472 which must be decrefed by the caller.
5473 Return 0 on success, -1 on error */
5474static
5475int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5476{
Christian Heimes217cfd12007-12-02 14:31:20 +00005477 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478 PyObject *x;
5479
5480 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 x = PyObject_GetItem(mapping, w);
5483 Py_DECREF(w);
5484 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5486 /* No mapping found means: use 1:1 mapping. */
5487 PyErr_Clear();
5488 *result = NULL;
5489 return 0;
5490 } else
5491 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005492 }
5493 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 *result = x;
5495 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005496 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005497 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 long value = PyLong_AS_LONG(x);
5499 long max = PyUnicode_GetMax();
5500 if (value < 0 || value > max) {
5501 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005502 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 Py_DECREF(x);
5504 return -1;
5505 }
5506 *result = x;
5507 return 0;
5508 }
5509 else if (PyUnicode_Check(x)) {
5510 *result = x;
5511 return 0;
5512 }
5513 else {
5514 /* wrong return value */
5515 PyErr_SetString(PyExc_TypeError,
5516 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005517 Py_DECREF(x);
5518 return -1;
5519 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005520}
5521/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 if not reallocate and adjust various state variables.
5523 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524static
Walter Dörwald4894c302003-10-24 14:25:28 +00005525int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005528 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005529 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 /* remember old output position */
5531 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5532 /* exponentially overallocate to minimize reallocations */
5533 if (requiredsize < 2 * oldsize)
5534 requiredsize = 2 * oldsize;
5535 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5536 return -1;
5537 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 }
5539 return 0;
5540}
5541/* lookup the character, put the result in the output string and adjust
5542 various state variables. Return a new reference to the object that
5543 was put in the output buffer in *result, or Py_None, if the mapping was
5544 undefined (in which case no character was written).
5545 The called must decref result.
5546 Return 0 on success, -1 on error. */
5547static
Walter Dörwald4894c302003-10-24 14:25:28 +00005548int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5550 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551{
Walter Dörwald4894c302003-10-24 14:25:28 +00005552 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 /* not found => default to 1:1 mapping */
5556 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005557 }
5558 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005560 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 /* no overflow check, because we know that the space is enough */
5562 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 }
5564 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5566 if (repsize==1) {
5567 /* no overflow check, because we know that the space is enough */
5568 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5569 }
5570 else if (repsize!=0) {
5571 /* more than one character */
5572 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5573 (insize - (curinp-startinp)) +
5574 repsize - 1;
5575 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5576 return -1;
5577 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5578 *outp += repsize;
5579 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 }
5581 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005583 return 0;
5584}
5585
5586PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 Py_ssize_t size,
5588 PyObject *mapping,
5589 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 /* output object */
5592 PyObject *res = NULL;
5593 /* pointers to the beginning and end+1 of input */
5594 const Py_UNICODE *startp = p;
5595 const Py_UNICODE *endp = p + size;
5596 /* pointer into the output */
5597 Py_UNICODE *str;
5598 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005599 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 char *reason = "character maps to <undefined>";
5601 PyObject *errorHandler = NULL;
5602 PyObject *exc = NULL;
5603 /* the following variable is used for caching string comparisons
5604 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5605 * 3=ignore, 4=xmlcharrefreplace */
5606 int known_errorHandler = -1;
5607
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 PyErr_BadArgument();
5610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005612
5613 /* allocate enough for a simple 1:1 translation without
5614 replacements, if we need more, we'll resize */
5615 res = PyUnicode_FromUnicode(NULL, size);
5616 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 /* try to encode it */
5624 PyObject *x = NULL;
5625 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5626 Py_XDECREF(x);
5627 goto onError;
5628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005629 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 if (x!=Py_None) /* it worked => adjust input pointer */
5631 ++p;
5632 else { /* untranslatable character */
5633 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5634 Py_ssize_t repsize;
5635 Py_ssize_t newpos;
5636 Py_UNICODE *uni2;
5637 /* startpos for collecting untranslatable chars */
5638 const Py_UNICODE *collstart = p;
5639 const Py_UNICODE *collend = p+1;
5640 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 /* find all untranslatable characters */
5643 while (collend < endp) {
5644 if (charmaptranslate_lookup(*collend, mapping, &x))
5645 goto onError;
5646 Py_XDECREF(x);
5647 if (x!=Py_None)
5648 break;
5649 ++collend;
5650 }
5651 /* cache callback name lookup
5652 * (if not done yet, i.e. it's the first error) */
5653 if (known_errorHandler==-1) {
5654 if ((errors==NULL) || (!strcmp(errors, "strict")))
5655 known_errorHandler = 1;
5656 else if (!strcmp(errors, "replace"))
5657 known_errorHandler = 2;
5658 else if (!strcmp(errors, "ignore"))
5659 known_errorHandler = 3;
5660 else if (!strcmp(errors, "xmlcharrefreplace"))
5661 known_errorHandler = 4;
5662 else
5663 known_errorHandler = 0;
5664 }
5665 switch (known_errorHandler) {
5666 case 1: /* strict */
5667 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005668 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 case 2: /* replace */
5670 /* No need to check for space, this is a 1:1 replacement */
5671 for (coll = collstart; coll<collend; ++coll)
5672 *str++ = '?';
5673 /* fall through */
5674 case 3: /* ignore */
5675 p = collend;
5676 break;
5677 case 4: /* xmlcharrefreplace */
5678 /* generate replacement (temporarily (mis)uses p) */
5679 for (p = collstart; p < collend; ++p) {
5680 char buffer[2+29+1+1];
5681 char *cp;
5682 sprintf(buffer, "&#%d;", (int)*p);
5683 if (charmaptranslate_makespace(&res, &str,
5684 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5685 goto onError;
5686 for (cp = buffer; *cp; ++cp)
5687 *str++ = *cp;
5688 }
5689 p = collend;
5690 break;
5691 default:
5692 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5693 reason, startp, size, &exc,
5694 collstart-startp, collend-startp, &newpos);
5695 if (repunicode == NULL)
5696 goto onError;
5697 /* generate replacement */
5698 repsize = PyUnicode_GET_SIZE(repunicode);
5699 if (charmaptranslate_makespace(&res, &str,
5700 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5701 Py_DECREF(repunicode);
5702 goto onError;
5703 }
5704 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5705 *str++ = *uni2;
5706 p = startp + newpos;
5707 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005708 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005709 }
5710 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 /* Resize if we allocated to much */
5712 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005713 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 if (PyUnicode_Resize(&res, respos) < 0)
5715 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716 }
5717 Py_XDECREF(exc);
5718 Py_XDECREF(errorHandler);
5719 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 Py_XDECREF(res);
5723 Py_XDECREF(exc);
5724 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 return NULL;
5726}
5727
5728PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 PyObject *mapping,
5730 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731{
5732 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005733
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 str = PyUnicode_FromObject(str);
5735 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 PyUnicode_GET_SIZE(str),
5739 mapping,
5740 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 Py_DECREF(str);
5742 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005743
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 Py_XDECREF(str);
5746 return NULL;
5747}
Tim Petersced69f82003-09-16 20:30:58 +00005748
Guido van Rossum9e896b32000-04-05 20:11:21 +00005749/* --- Decimal Encoder ---------------------------------------------------- */
5750
5751int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 Py_ssize_t length,
5753 char *output,
5754 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005755{
5756 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 PyObject *errorHandler = NULL;
5758 PyObject *exc = NULL;
5759 const char *encoding = "decimal";
5760 const char *reason = "invalid decimal Unicode string";
5761 /* the following variable is used for caching string comparisons
5762 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5763 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005764
5765 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 PyErr_BadArgument();
5767 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005768 }
5769
5770 p = s;
5771 end = s + length;
5772 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 register Py_UNICODE ch = *p;
5774 int decimal;
5775 PyObject *repunicode;
5776 Py_ssize_t repsize;
5777 Py_ssize_t newpos;
5778 Py_UNICODE *uni2;
5779 Py_UNICODE *collstart;
5780 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005781
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005783 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 ++p;
5785 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005786 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 decimal = Py_UNICODE_TODECIMAL(ch);
5788 if (decimal >= 0) {
5789 *output++ = '0' + decimal;
5790 ++p;
5791 continue;
5792 }
5793 if (0 < ch && ch < 256) {
5794 *output++ = (char)ch;
5795 ++p;
5796 continue;
5797 }
5798 /* All other characters are considered unencodable */
5799 collstart = p;
5800 collend = p+1;
5801 while (collend < end) {
5802 if ((0 < *collend && *collend < 256) ||
5803 !Py_UNICODE_ISSPACE(*collend) ||
5804 Py_UNICODE_TODECIMAL(*collend))
5805 break;
5806 }
5807 /* cache callback name lookup
5808 * (if not done yet, i.e. it's the first error) */
5809 if (known_errorHandler==-1) {
5810 if ((errors==NULL) || (!strcmp(errors, "strict")))
5811 known_errorHandler = 1;
5812 else if (!strcmp(errors, "replace"))
5813 known_errorHandler = 2;
5814 else if (!strcmp(errors, "ignore"))
5815 known_errorHandler = 3;
5816 else if (!strcmp(errors, "xmlcharrefreplace"))
5817 known_errorHandler = 4;
5818 else
5819 known_errorHandler = 0;
5820 }
5821 switch (known_errorHandler) {
5822 case 1: /* strict */
5823 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5824 goto onError;
5825 case 2: /* replace */
5826 for (p = collstart; p < collend; ++p)
5827 *output++ = '?';
5828 /* fall through */
5829 case 3: /* ignore */
5830 p = collend;
5831 break;
5832 case 4: /* xmlcharrefreplace */
5833 /* generate replacement (temporarily (mis)uses p) */
5834 for (p = collstart; p < collend; ++p)
5835 output += sprintf(output, "&#%d;", (int)*p);
5836 p = collend;
5837 break;
5838 default:
5839 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5840 encoding, reason, s, length, &exc,
5841 collstart-s, collend-s, &newpos);
5842 if (repunicode == NULL)
5843 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005844 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005845 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005846 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5847 Py_DECREF(repunicode);
5848 goto onError;
5849 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 /* generate replacement */
5851 repsize = PyUnicode_GET_SIZE(repunicode);
5852 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5853 Py_UNICODE ch = *uni2;
5854 if (Py_UNICODE_ISSPACE(ch))
5855 *output++ = ' ';
5856 else {
5857 decimal = Py_UNICODE_TODECIMAL(ch);
5858 if (decimal >= 0)
5859 *output++ = '0' + decimal;
5860 else if (0 < ch && ch < 256)
5861 *output++ = (char)ch;
5862 else {
5863 Py_DECREF(repunicode);
5864 raise_encode_exception(&exc, encoding,
5865 s, length, collstart-s, collend-s, reason);
5866 goto onError;
5867 }
5868 }
5869 }
5870 p = s + newpos;
5871 Py_DECREF(repunicode);
5872 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005873 }
5874 /* 0-terminate the output string */
5875 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 Py_XDECREF(exc);
5877 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005878 return 0;
5879
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881 Py_XDECREF(exc);
5882 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005883 return -1;
5884}
5885
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886/* --- Helpers ------------------------------------------------------------ */
5887
Eric Smith8c663262007-08-25 02:26:07 +00005888#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005889#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005890
Thomas Wouters477c8d52006-05-27 19:21:47 +00005891#include "stringlib/count.h"
5892#include "stringlib/find.h"
5893#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005894#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005895
Eric Smith5807c412008-05-11 21:00:57 +00005896#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005897#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005898#include "stringlib/localeutil.h"
5899
/* Helper macro to fix up start/end slice values in place.
 *
 * `end` is clamped to `len` when it exceeds it; a negative `end` or
 * `start` has `len` added to it and is then clamped to 0 if still
 * negative.  `start` and `end` must be modifiable lvalues.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely under an unbraced if/else.
 * Arguments are evaluated more than once, so they must not have side
 * effects.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005914
Martin v. Löwis18e16552006-02-15 17:27:45 +00005915Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005916 PyObject *substr,
5917 Py_ssize_t start,
5918 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005920 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005921 PyUnicodeObject* str_obj;
5922 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005923
Thomas Wouters477c8d52006-05-27 19:21:47 +00005924 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5925 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005927 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5928 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 Py_DECREF(str_obj);
5930 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 }
Tim Petersced69f82003-09-16 20:30:58 +00005932
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005933 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005934 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005935 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5936 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005937 );
5938
5939 Py_DECREF(sub_obj);
5940 Py_DECREF(str_obj);
5941
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 return result;
5943}
5944
Martin v. Löwis18e16552006-02-15 17:27:45 +00005945Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005946 PyObject *sub,
5947 Py_ssize_t start,
5948 Py_ssize_t end,
5949 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005951 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005952
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005954 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005956 sub = PyUnicode_FromObject(sub);
5957 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 Py_DECREF(str);
5959 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 }
Tim Petersced69f82003-09-16 20:30:58 +00005961
Thomas Wouters477c8d52006-05-27 19:21:47 +00005962 if (direction > 0)
5963 result = stringlib_find_slice(
5964 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5965 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5966 start, end
5967 );
5968 else
5969 result = stringlib_rfind_slice(
5970 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5971 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5972 start, end
5973 );
5974
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005976 Py_DECREF(sub);
5977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 return result;
5979}
5980
Tim Petersced69f82003-09-16 20:30:58 +00005981static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 PyUnicodeObject *substring,
5984 Py_ssize_t start,
5985 Py_ssize_t end,
5986 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 if (substring->length == 0)
5989 return 1;
5990
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005991 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 end -= substring->length;
5993 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995
5996 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 if (Py_UNICODE_MATCH(self, end, substring))
5998 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 } else {
6000 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 }
6003
6004 return 0;
6005}
6006
Martin v. Löwis18e16552006-02-15 17:27:45 +00006007Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 PyObject *substr,
6009 Py_ssize_t start,
6010 Py_ssize_t end,
6011 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006013 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006014
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 str = PyUnicode_FromObject(str);
6016 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 substr = PyUnicode_FromObject(substr);
6019 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 Py_DECREF(str);
6021 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 }
Tim Petersced69f82003-09-16 20:30:58 +00006023
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 (PyUnicodeObject *)substr,
6026 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 Py_DECREF(str);
6028 Py_DECREF(substr);
6029 return result;
6030}
6031
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032/* Apply fixfct filter to the Unicode object self and return a
6033 reference to the modified object */
6034
Tim Petersced69f82003-09-16 20:30:58 +00006035static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038{
6039
6040 PyUnicodeObject *u;
6041
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006042 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006045
6046 Py_UNICODE_COPY(u->str, self->str, self->length);
6047
Tim Peters7a29bd52001-09-12 03:03:31 +00006048 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 /* fixfct should return TRUE if it modified the buffer. If
6050 FALSE, return a reference to the original buffer instead
6051 (to save space, not time) */
6052 Py_INCREF(self);
6053 Py_DECREF(u);
6054 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 }
6056 return (PyObject*) u;
6057}
6058
Tim Petersced69f82003-09-16 20:30:58 +00006059static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060int fixupper(PyUnicodeObject *self)
6061{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006062 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 Py_UNICODE *s = self->str;
6064 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006065
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006068
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 ch = Py_UNICODE_TOUPPER(*s);
6070 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 *s = ch;
6073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 s++;
6075 }
6076
6077 return status;
6078}
6079
Tim Petersced69f82003-09-16 20:30:58 +00006080static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081int fixlower(PyUnicodeObject *self)
6082{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006083 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 Py_UNICODE *s = self->str;
6085 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006086
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006089
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 ch = Py_UNICODE_TOLOWER(*s);
6091 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 *s = ch;
6094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 s++;
6096 }
6097
6098 return status;
6099}
6100
Tim Petersced69f82003-09-16 20:30:58 +00006101static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102int fixswapcase(PyUnicodeObject *self)
6103{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006104 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 Py_UNICODE *s = self->str;
6106 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006107
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 while (len-- > 0) {
6109 if (Py_UNICODE_ISUPPER(*s)) {
6110 *s = Py_UNICODE_TOLOWER(*s);
6111 status = 1;
6112 } else if (Py_UNICODE_ISLOWER(*s)) {
6113 *s = Py_UNICODE_TOUPPER(*s);
6114 status = 1;
6115 }
6116 s++;
6117 }
6118
6119 return status;
6120}
6121
Tim Petersced69f82003-09-16 20:30:58 +00006122static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123int fixcapitalize(PyUnicodeObject *self)
6124{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006125 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006126 Py_UNICODE *s = self->str;
6127 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006128
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006129 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006131 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 *s = Py_UNICODE_TOUPPER(*s);
6133 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006135 s++;
6136 while (--len > 0) {
6137 if (Py_UNICODE_ISUPPER(*s)) {
6138 *s = Py_UNICODE_TOLOWER(*s);
6139 status = 1;
6140 }
6141 s++;
6142 }
6143 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144}
6145
6146static
6147int fixtitle(PyUnicodeObject *self)
6148{
6149 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6150 register Py_UNICODE *e;
6151 int previous_is_cased;
6152
6153 /* Shortcut for single character strings */
6154 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6156 if (*p != ch) {
6157 *p = ch;
6158 return 1;
6159 }
6160 else
6161 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 }
Tim Petersced69f82003-09-16 20:30:58 +00006163
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 e = p + PyUnicode_GET_SIZE(self);
6165 previous_is_cased = 0;
6166 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006168
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 if (previous_is_cased)
6170 *p = Py_UNICODE_TOLOWER(ch);
6171 else
6172 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006173
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 if (Py_UNICODE_ISLOWER(ch) ||
6175 Py_UNICODE_ISUPPER(ch) ||
6176 Py_UNICODE_ISTITLE(ch))
6177 previous_is_cased = 1;
6178 else
6179 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 }
6181 return 1;
6182}
6183
Tim Peters8ce9f162004-08-27 01:49:32 +00006184PyObject *
6185PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186{
Skip Montanaro6543b452004-09-16 03:28:13 +00006187 const Py_UNICODE blank = ' ';
6188 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006189 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006190 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006191 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6192 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006193 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6194 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006195 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006196 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
Tim Peters05eba1f2004-08-27 21:32:02 +00006198 fseq = PySequence_Fast(seq, "");
6199 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006200 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006201 }
6202
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006203 /* NOTE: the following code can't call back into Python code,
6204 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006205 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006206
Tim Peters05eba1f2004-08-27 21:32:02 +00006207 seqlen = PySequence_Fast_GET_SIZE(fseq);
6208 /* If empty sequence, return u"". */
6209 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006210 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6211 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006212 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006213 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006214 /* If singleton sequence with an exact Unicode, return that. */
6215 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 item = items[0];
6217 if (PyUnicode_CheckExact(item)) {
6218 Py_INCREF(item);
6219 res = (PyUnicodeObject *)item;
6220 goto Done;
6221 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006222 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006223 else {
6224 /* Set up sep and seplen */
6225 if (separator == NULL) {
6226 sep = &blank;
6227 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006228 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006229 else {
6230 if (!PyUnicode_Check(separator)) {
6231 PyErr_Format(PyExc_TypeError,
6232 "separator: expected str instance,"
6233 " %.80s found",
6234 Py_TYPE(separator)->tp_name);
6235 goto onError;
6236 }
6237 sep = PyUnicode_AS_UNICODE(separator);
6238 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006239 }
6240 }
6241
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006242 /* There are at least two things to join, or else we have a subclass
6243 * of str in the sequence.
6244 * Do a pre-pass to figure out the total amount of space we'll
6245 * need (sz), and see whether all argument are strings.
6246 */
6247 sz = 0;
6248 for (i = 0; i < seqlen; i++) {
6249 const Py_ssize_t old_sz = sz;
6250 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 if (!PyUnicode_Check(item)) {
6252 PyErr_Format(PyExc_TypeError,
6253 "sequence item %zd: expected str instance,"
6254 " %.80s found",
6255 i, Py_TYPE(item)->tp_name);
6256 goto onError;
6257 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006258 sz += PyUnicode_GET_SIZE(item);
6259 if (i != 0)
6260 sz += seplen;
6261 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6262 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006264 goto onError;
6265 }
6266 }
Tim Petersced69f82003-09-16 20:30:58 +00006267
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006268 res = _PyUnicode_New(sz);
6269 if (res == NULL)
6270 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006271
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006272 /* Catenate everything. */
6273 res_p = PyUnicode_AS_UNICODE(res);
6274 for (i = 0; i < seqlen; ++i) {
6275 Py_ssize_t itemlen;
6276 item = items[i];
6277 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 /* Copy item, and maybe the separator. */
6279 if (i) {
6280 Py_UNICODE_COPY(res_p, sep, seplen);
6281 res_p += seplen;
6282 }
6283 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6284 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006285 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006286
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006288 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 return (PyObject *)res;
6290
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006292 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006293 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 return NULL;
6295}
6296
Tim Petersced69f82003-09-16 20:30:58 +00006297static
6298PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 Py_ssize_t left,
6300 Py_ssize_t right,
6301 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302{
6303 PyUnicodeObject *u;
6304
6305 if (left < 0)
6306 left = 0;
6307 if (right < 0)
6308 right = 0;
6309
Tim Peters7a29bd52001-09-12 03:03:31 +00006310 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 Py_INCREF(self);
6312 return self;
6313 }
6314
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006315 if (left > PY_SSIZE_T_MAX - self->length ||
6316 right > PY_SSIZE_T_MAX - (left + self->length)) {
6317 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6318 return NULL;
6319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 u = _PyUnicode_New(left + self->length + right);
6321 if (u) {
6322 if (left)
6323 Py_UNICODE_FILL(u->str, fill, left);
6324 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6325 if (right)
6326 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6327 }
6328
6329 return u;
6330}
6331
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006332PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335
6336 string = PyUnicode_FromObject(string);
6337 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006340 list = stringlib_splitlines(
6341 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6342 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
6344 Py_DECREF(string);
6345 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346}
6347
Tim Petersced69f82003-09-16 20:30:58 +00006348static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 PyUnicodeObject *substring,
6351 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006354 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006357 return stringlib_split_whitespace(
6358 (PyObject*) self, self->str, self->length, maxcount
6359 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006361 return stringlib_split(
6362 (PyObject*) self, self->str, self->length,
6363 substring->str, substring->length,
6364 maxcount
6365 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366}
6367
Tim Petersced69f82003-09-16 20:30:58 +00006368static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006369PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 PyUnicodeObject *substring,
6371 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006372{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006373 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006374 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006375
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006376 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006377 return stringlib_rsplit_whitespace(
6378 (PyObject*) self, self->str, self->length, maxcount
6379 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006380
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006381 return stringlib_rsplit(
6382 (PyObject*) self, self->str, self->length,
6383 substring->str, substring->length,
6384 maxcount
6385 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006386}
6387
6388static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 PyUnicodeObject *str1,
6391 PyUnicodeObject *str2,
6392 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
6394 PyUnicodeObject *u;
6395
6396 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006398 else if (maxcount == 0 || self->length == 0)
6399 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400
Thomas Wouters477c8d52006-05-27 19:21:47 +00006401 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006402 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006403 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006404 if (str1->length == 0)
6405 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006406 if (str1->length == 1) {
6407 /* replace characters */
6408 Py_UNICODE u1, u2;
6409 if (!findchar(self->str, self->length, str1->str[0]))
6410 goto nothing;
6411 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6412 if (!u)
6413 return NULL;
6414 Py_UNICODE_COPY(u->str, self->str, self->length);
6415 u1 = str1->str[0];
6416 u2 = str2->str[0];
6417 for (i = 0; i < u->length; i++)
6418 if (u->str[i] == u1) {
6419 if (--maxcount < 0)
6420 break;
6421 u->str[i] = u2;
6422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006424 i = stringlib_find(
6425 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006427 if (i < 0)
6428 goto nothing;
6429 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6430 if (!u)
6431 return NULL;
6432 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006433
6434 /* change everything in-place, starting with this one */
6435 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6436 i += str1->length;
6437
6438 while ( --maxcount > 0) {
6439 i = stringlib_find(self->str+i, self->length-i,
6440 str1->str, str1->length,
6441 i);
6442 if (i == -1)
6443 break;
6444 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6445 i += str1->length;
6446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449
6450 Py_ssize_t n, i, j, e;
6451 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 Py_UNICODE *p;
6453
6454 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006455 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6456 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006457 if (n == 0)
6458 goto nothing;
6459 /* new_size = self->length + n * (str2->length - str1->length)); */
6460 delta = (str2->length - str1->length);
6461 if (delta == 0) {
6462 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006464 product = n * (str2->length - str1->length);
6465 if ((product / (str2->length - str1->length)) != n) {
6466 PyErr_SetString(PyExc_OverflowError,
6467 "replace string is too long");
6468 return NULL;
6469 }
6470 new_size = self->length + product;
6471 if (new_size < 0) {
6472 PyErr_SetString(PyExc_OverflowError,
6473 "replace string is too long");
6474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 }
6476 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 u = _PyUnicode_New(new_size);
6478 if (!u)
6479 return NULL;
6480 i = 0;
6481 p = u->str;
6482 e = self->length - str1->length;
6483 if (str1->length > 0) {
6484 while (n-- > 0) {
6485 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006486 j = stringlib_find(self->str+i, self->length-i,
6487 str1->str, str1->length,
6488 i);
6489 if (j == -1)
6490 break;
6491 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006492 /* copy unchanged part [i:j] */
6493 Py_UNICODE_COPY(p, self->str+i, j-i);
6494 p += j - i;
6495 }
6496 /* copy substitution string */
6497 if (str2->length > 0) {
6498 Py_UNICODE_COPY(p, str2->str, str2->length);
6499 p += str2->length;
6500 }
6501 i = j + str1->length;
6502 }
6503 if (i < self->length)
6504 /* copy tail [i:] */
6505 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6506 } else {
6507 /* interleave */
6508 while (n > 0) {
6509 Py_UNICODE_COPY(p, str2->str, str2->length);
6510 p += str2->length;
6511 if (--n <= 0)
6512 break;
6513 *p++ = self->str[i++];
6514 }
6515 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006519
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006521 /* nothing to replace; return original string (when possible) */
6522 if (PyUnicode_CheckExact(self)) {
6523 Py_INCREF(self);
6524 return (PyObject *) self;
6525 }
6526 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527}
6528
6529/* --- Unicode Object Methods --------------------------------------------- */
6530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006531PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533\n\
6534Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006535characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536
6537static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006538unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 return fixup(self, fixtitle);
6541}
6542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006543PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545\n\
6546Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006547have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548
6549static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006550unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 return fixup(self, fixcapitalize);
6553}
6554
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
 *
 * Splits self with split(self, NULL, -1), replaces each list entry by
 * the result of fixup(entry, fixcapitalize), then joins the list with
 * PyUnicode_Join(NULL, list).  Returns NULL if any step fails.
 */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL)
            goto done;
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6592
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006593/* Argument converter. Coerces to a single unicode character */
6594
6595static int
6596convert_uc(PyObject *obj, void *addr)
6597{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006598 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6599 PyObject *uniobj;
6600 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006601
Benjamin Peterson14339b62009-01-31 16:36:08 +00006602 uniobj = PyUnicode_FromObject(obj);
6603 if (uniobj == NULL) {
6604 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006606 return 0;
6607 }
6608 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6609 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006611 Py_DECREF(uniobj);
6612 return 0;
6613 }
6614 unistr = PyUnicode_AS_UNICODE(uniobj);
6615 *fillcharloc = unistr[0];
6616 Py_DECREF(uniobj);
6617 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006618}
6619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006620PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006623Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006624done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625
6626static PyObject *
6627unicode_center(PyUnicodeObject *self, PyObject *args)
6628{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006629 Py_ssize_t marg, left;
6630 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006631 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632
Thomas Woutersde017742006-02-16 19:34:37 +00006633 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 return NULL;
6635
Tim Peters7a29bd52001-09-12 03:03:31 +00006636 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 Py_INCREF(self);
6638 return (PyObject*) self;
6639 }
6640
6641 marg = width - self->length;
6642 left = marg / 2 + (marg & width & 1);
6643
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006644 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645}
6646
Marc-André Lemburge5034372000-08-08 08:04:29 +00006647#if 0
6648
6649/* This code should go into some future Unicode collation support
6650 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006651 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006652
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006653/* speedy UTF-16 code point order comparison */
6654/* gleaned from: */
6655/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6656
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006657static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006658{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006659 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006660 0, 0, 0, 0, 0, 0, 0, 0,
6661 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006662 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006663};
6664
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665static int
6666unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6667{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006668 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006669
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 Py_UNICODE *s1 = str1->str;
6671 Py_UNICODE *s2 = str2->str;
6672
6673 len1 = str1->length;
6674 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006675
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006677 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006678
6679 c1 = *s1++;
6680 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006681
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 if (c1 > (1<<11) * 26)
6683 c1 += utf16Fixup[c1>>11];
6684 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006685 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006686 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006687
6688 if (c1 != c2)
6689 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006690
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006691 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 }
6693
6694 return (len1 < len2) ? -1 : (len1 != len2);
6695}
6696
Marc-André Lemburge5034372000-08-08 08:04:29 +00006697#else
6698
6699static int
6700unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6701{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006702 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006703
6704 Py_UNICODE *s1 = str1->str;
6705 Py_UNICODE *s2 = str2->str;
6706
6707 len1 = str1->length;
6708 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006709
Marc-André Lemburge5034372000-08-08 08:04:29 +00006710 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006711 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006712
Fredrik Lundh45714e92001-06-26 16:39:36 +00006713 c1 = *s1++;
6714 c2 = *s2++;
6715
6716 if (c1 != c2)
6717 return (c1 < c2) ? -1 : 1;
6718
Marc-André Lemburge5034372000-08-08 08:04:29 +00006719 len1--; len2--;
6720 }
6721
6722 return (len1 < len2) ? -1 : (len1 != len2);
6723}
6724
6725#endif
6726
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006730 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6731 return unicode_compare((PyUnicodeObject *)left,
6732 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006733 PyErr_Format(PyExc_TypeError,
6734 "Can't compare %.100s and %.100s",
6735 left->ob_type->tp_name,
6736 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 return -1;
6738}
6739
Martin v. Löwis5b222132007-06-10 09:51:05 +00006740int
6741PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6742{
6743 int i;
6744 Py_UNICODE *id;
6745 assert(PyUnicode_Check(uni));
6746 id = PyUnicode_AS_UNICODE(uni);
6747 /* Compare Unicode string and source character set string */
6748 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 if (id[i] != str[i])
6750 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006751 /* This check keeps Python strings that end in '\0' from comparing equal
6752 to C strings identical up to that point. */
6753 if (PyUnicode_GET_SIZE(uni) != i)
6754 /* We'll say the Python string is longer. */
6755 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006756 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006758 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006760 return 0;
6761}
6762
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006763
Benjamin Peterson29060642009-01-31 22:14:21 +00006764#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006765 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006766
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006767PyObject *PyUnicode_RichCompare(PyObject *left,
6768 PyObject *right,
6769 int op)
6770{
6771 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006772
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006773 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6774 PyObject *v;
6775 if (((PyUnicodeObject *) left)->length !=
6776 ((PyUnicodeObject *) right)->length) {
6777 if (op == Py_EQ) {
6778 Py_INCREF(Py_False);
6779 return Py_False;
6780 }
6781 if (op == Py_NE) {
6782 Py_INCREF(Py_True);
6783 return Py_True;
6784 }
6785 }
6786 if (left == right)
6787 result = 0;
6788 else
6789 result = unicode_compare((PyUnicodeObject *)left,
6790 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006791
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006792 /* Convert the return value to a Boolean */
6793 switch (op) {
6794 case Py_EQ:
6795 v = TEST_COND(result == 0);
6796 break;
6797 case Py_NE:
6798 v = TEST_COND(result != 0);
6799 break;
6800 case Py_LE:
6801 v = TEST_COND(result <= 0);
6802 break;
6803 case Py_GE:
6804 v = TEST_COND(result >= 0);
6805 break;
6806 case Py_LT:
6807 v = TEST_COND(result == -1);
6808 break;
6809 case Py_GT:
6810 v = TEST_COND(result == 1);
6811 break;
6812 default:
6813 PyErr_BadArgument();
6814 return NULL;
6815 }
6816 Py_INCREF(v);
6817 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006818 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006819
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006820 Py_INCREF(Py_NotImplemented);
6821 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006822}
6823
Guido van Rossum403d68b2000-03-13 15:55:09 +00006824int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006826{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006827 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006828 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006829
6830 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006831 sub = PyUnicode_FromObject(element);
6832 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 PyErr_Format(PyExc_TypeError,
6834 "'in <string>' requires string as left operand, not %s",
6835 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006836 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006837 }
6838
Thomas Wouters477c8d52006-05-27 19:21:47 +00006839 str = PyUnicode_FromObject(container);
6840 if (!str) {
6841 Py_DECREF(sub);
6842 return -1;
6843 }
6844
6845 result = stringlib_contains_obj(str, sub);
6846
6847 Py_DECREF(str);
6848 Py_DECREF(sub);
6849
Guido van Rossum403d68b2000-03-13 15:55:09 +00006850 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006851}
6852
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853/* Concat to string or Unicode object giving a new Unicode object. */
6854
6855PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857{
6858 PyUnicodeObject *u = NULL, *v = NULL, *w;
6859
6860 /* Coerce the two arguments */
6861 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6862 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6865 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867
6868 /* Shortcuts */
6869 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 Py_DECREF(v);
6871 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 }
6873 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 Py_DECREF(u);
6875 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 }
6877
6878 /* Concat the two Unicode strings */
6879 w = _PyUnicode_New(u->length + v->length);
6880 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 Py_UNICODE_COPY(w->str, u->str, u->length);
6883 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6884
6885 Py_DECREF(u);
6886 Py_DECREF(v);
6887 return (PyObject *)w;
6888
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 Py_XDECREF(u);
6891 Py_XDECREF(v);
6892 return NULL;
6893}
6894
Walter Dörwald1ab83302007-05-18 17:15:44 +00006895void
6896PyUnicode_Append(PyObject **pleft, PyObject *right)
6897{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006898 PyObject *new;
6899 if (*pleft == NULL)
6900 return;
6901 if (right == NULL || !PyUnicode_Check(*pleft)) {
6902 Py_DECREF(*pleft);
6903 *pleft = NULL;
6904 return;
6905 }
6906 new = PyUnicode_Concat(*pleft, right);
6907 Py_DECREF(*pleft);
6908 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006909}
6910
6911void
6912PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6913{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006914 PyUnicode_Append(pleft, right);
6915 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006916}
6917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006918PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006921Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006922string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006923interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924
6925static PyObject *
6926unicode_count(PyUnicodeObject *self, PyObject *args)
6927{
6928 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006929 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006930 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 PyObject *result;
6932
Guido van Rossumb8872e62000-05-09 14:14:27 +00006933 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 return NULL;
6936
6937 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006938 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006941
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006942 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006943 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006944 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006945 substring->str, substring->length,
6946 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006947 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948
6949 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006950
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 return result;
6952}
6953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006954PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006957Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006958to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006959handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6961'xmlcharrefreplace' as well as any other name registered with\n\
6962codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963
6964static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00006965unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966{
Benjamin Peterson308d6372009-09-18 21:42:35 +00006967 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 char *encoding = NULL;
6969 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006970 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006971
Benjamin Peterson308d6372009-09-18 21:42:35 +00006972 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6973 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006975 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006976 if (v == NULL)
6977 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006978 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006979 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006980 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006981 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006982 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006983 Py_DECREF(v);
6984 return NULL;
6985 }
6986 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006987
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006989 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006990}
6991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006992PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994\n\
6995Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006996If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997
6998static PyObject*
6999unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7000{
7001 Py_UNICODE *e;
7002 Py_UNICODE *p;
7003 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007004 Py_UNICODE *qe;
7005 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 PyUnicodeObject *u;
7007 int tabsize = 8;
7008
7009 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011
Thomas Wouters7e474022000-07-16 12:04:32 +00007012 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007013 i = 0; /* chars up to and including most recent \n or \r */
7014 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7015 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 for (p = self->str; p < e; p++)
7017 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 if (tabsize > 0) {
7019 incr = tabsize - (j % tabsize); /* cannot overflow */
7020 if (j > PY_SSIZE_T_MAX - incr)
7021 goto overflow1;
7022 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007023 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 if (j > PY_SSIZE_T_MAX - 1)
7027 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 j++;
7029 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 if (i > PY_SSIZE_T_MAX - j)
7031 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007033 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 }
7035 }
7036
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007037 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007039
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 /* Second pass: create output string and fill it */
7041 u = _PyUnicode_New(i + j);
7042 if (!u)
7043 return NULL;
7044
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007045 j = 0; /* same as in first pass */
7046 q = u->str; /* next output char */
7047 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048
7049 for (p = self->str; p < e; p++)
7050 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 if (tabsize > 0) {
7052 i = tabsize - (j % tabsize);
7053 j += i;
7054 while (i--) {
7055 if (q >= qe)
7056 goto overflow2;
7057 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007060 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 else {
7062 if (q >= qe)
7063 goto overflow2;
7064 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007065 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 if (*p == '\n' || *p == '\r')
7067 j = 0;
7068 }
7069
7070 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007071
7072 overflow2:
7073 Py_DECREF(u);
7074 overflow1:
7075 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077}
7078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007079PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081\n\
7082Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007083such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084arguments start and end are interpreted as in slice notation.\n\
7085\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007086Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087
7088static PyObject *
7089unicode_find(PyUnicodeObject *self, PyObject *args)
7090{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007091 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007092 Py_ssize_t start;
7093 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007094 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
Christian Heimes9cd17752007-11-18 19:35:23 +00007096 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098
Thomas Wouters477c8d52006-05-27 19:21:47 +00007099 result = stringlib_find_slice(
7100 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7101 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7102 start, end
7103 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104
7105 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007106
Christian Heimes217cfd12007-12-02 14:31:20 +00007107 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108}
7109
7110static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007111unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112{
7113 if (index < 0 || index >= self->length) {
7114 PyErr_SetString(PyExc_IndexError, "string index out of range");
7115 return NULL;
7116 }
7117
7118 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7119}
7120
Guido van Rossumc2504932007-09-18 19:42:40 +00007121/* Believe it or not, this produces the same value for ASCII strings
7122 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007124unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125{
Guido van Rossumc2504932007-09-18 19:42:40 +00007126 Py_ssize_t len;
7127 Py_UNICODE *p;
7128 long x;
7129
7130 if (self->hash != -1)
7131 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007132 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007133 p = self->str;
7134 x = *p << 7;
7135 while (--len >= 0)
7136 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007137 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007138 if (x == -1)
7139 x = -2;
7140 self->hash = x;
7141 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142}
7143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007145 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007147Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148
7149static PyObject *
7150unicode_index(PyUnicodeObject *self, PyObject *args)
7151{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007152 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007153 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007154 Py_ssize_t start;
7155 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
Christian Heimes9cd17752007-11-18 19:35:23 +00007157 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159
Thomas Wouters477c8d52006-05-27 19:21:47 +00007160 result = stringlib_find_slice(
7161 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7162 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7163 start, end
7164 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165
7166 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007167
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 if (result < 0) {
7169 PyErr_SetString(PyExc_ValueError, "substring not found");
7170 return NULL;
7171 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007172
Christian Heimes217cfd12007-12-02 14:31:20 +00007173 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174}
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007179Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007180at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181
7182static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007183unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184{
7185 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7186 register const Py_UNICODE *e;
7187 int cased;
7188
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 /* Shortcut for single character strings */
7190 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007193 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007194 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007196
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 e = p + PyUnicode_GET_SIZE(self);
7198 cased = 0;
7199 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007201
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7203 return PyBool_FromLong(0);
7204 else if (!cased && Py_UNICODE_ISLOWER(ch))
7205 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007207 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208}
7209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007213Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007214at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
7216static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007217unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218{
7219 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7220 register const Py_UNICODE *e;
7221 int cased;
7222
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 /* Shortcut for single character strings */
7224 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007227 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007228 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007230
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 e = p + PyUnicode_GET_SIZE(self);
7232 cased = 0;
7233 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007235
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7237 return PyBool_FromLong(0);
7238 else if (!cased && Py_UNICODE_ISUPPER(ch))
7239 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007241 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242}
7243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007244PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007247Return True if S is a titlecased string and there is at least one\n\
7248character in S, i.e. upper- and titlecase characters may only\n\
7249follow uncased characters and lowercase characters only cased ones.\n\
7250Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251
7252static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007253unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254{
7255 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7256 register const Py_UNICODE *e;
7257 int cased, previous_is_cased;
7258
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 /* Shortcut for single character strings */
7260 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7262 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007264 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007265 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007267
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 e = p + PyUnicode_GET_SIZE(self);
7269 cased = 0;
7270 previous_is_cased = 0;
7271 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007273
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7275 if (previous_is_cased)
7276 return PyBool_FromLong(0);
7277 previous_is_cased = 1;
7278 cased = 1;
7279 }
7280 else if (Py_UNICODE_ISLOWER(ch)) {
7281 if (!previous_is_cased)
7282 return PyBool_FromLong(0);
7283 previous_is_cased = 1;
7284 cased = 1;
7285 }
7286 else
7287 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007289 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290}
7291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007292PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007295Return True if all characters in S are whitespace\n\
7296and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297
7298static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007299unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300{
7301 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7302 register const Py_UNICODE *e;
7303
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 /* Shortcut for single character strings */
7305 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 Py_UNICODE_ISSPACE(*p))
7307 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007309 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007310 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007312
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 e = p + PyUnicode_GET_SIZE(self);
7314 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 if (!Py_UNICODE_ISSPACE(*p))
7316 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007318 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319}
7320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007321PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007323\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007324Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007325and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007326
7327static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007328unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007329{
7330 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7331 register const Py_UNICODE *e;
7332
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007333 /* Shortcut for single character strings */
7334 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 Py_UNICODE_ISALPHA(*p))
7336 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007337
7338 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007339 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007341
7342 e = p + PyUnicode_GET_SIZE(self);
7343 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 if (!Py_UNICODE_ISALPHA(*p))
7345 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007346 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007347 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007348}
7349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007350PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007352\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007353Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007354and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007355
7356static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007357unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007358{
7359 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7360 register const Py_UNICODE *e;
7361
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007362 /* Shortcut for single character strings */
7363 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007364 Py_UNICODE_ISALNUM(*p))
7365 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007366
7367 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007368 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007370
7371 e = p + PyUnicode_GET_SIZE(self);
7372 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 if (!Py_UNICODE_ISALNUM(*p))
7374 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007375 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007376 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007377}
7378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007379PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007382Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007383False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384
7385static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007386unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387{
7388 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7389 register const Py_UNICODE *e;
7390
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391 /* Shortcut for single character strings */
7392 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 Py_UNICODE_ISDECIMAL(*p))
7394 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007396 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007397 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007399
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 e = p + PyUnicode_GET_SIZE(self);
7401 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 if (!Py_UNICODE_ISDECIMAL(*p))
7403 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007405 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406}
7407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007408PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007411Return True if all characters in S are digits\n\
7412and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413
7414static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007415unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416{
7417 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7418 register const Py_UNICODE *e;
7419
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 /* Shortcut for single character strings */
7421 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 Py_UNICODE_ISDIGIT(*p))
7423 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007425 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007426 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007428
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 e = p + PyUnicode_GET_SIZE(self);
7430 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 if (!Py_UNICODE_ISDIGIT(*p))
7432 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007434 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435}
7436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007437PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007440Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
7443static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007444unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445{
7446 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7447 register const Py_UNICODE *e;
7448
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 /* Shortcut for single character strings */
7450 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 Py_UNICODE_ISNUMERIC(*p))
7452 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007454 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007455 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007457
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 e = p + PyUnicode_GET_SIZE(self);
7459 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 if (!Py_UNICODE_ISNUMERIC(*p))
7461 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007463 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464}
7465
Martin v. Löwis47383402007-08-15 07:32:56 +00007466int
7467PyUnicode_IsIdentifier(PyObject *self)
7468{
7469 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7470 register const Py_UNICODE *e;
7471
7472 /* Special case for empty strings */
7473 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007475
7476 /* PEP 3131 says that the first character must be in
7477 XID_Start and subsequent characters in XID_Continue,
7478 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007479 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007480 letters, digits, underscore). However, given the current
7481 definition of XID_Start and XID_Continue, it is sufficient
7482 to check just for these, except that _ must be allowed
7483 as starting an identifier. */
7484 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7485 return 0;
7486
7487 e = p + PyUnicode_GET_SIZE(self);
7488 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 if (!_PyUnicode_IsXidContinue(*p))
7490 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007491 }
7492 return 1;
7493}
7494
7495PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007497\n\
7498Return True if S is a valid identifier according\n\
7499to the language definition.");
7500
7501static PyObject*
7502unicode_isidentifier(PyObject *self)
7503{
7504 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7505}
7506
Georg Brandl559e5d72008-06-11 18:37:52 +00007507PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007509\n\
7510Return True if all characters in S are considered\n\
7511printable in repr() or S is empty, False otherwise.");
7512
7513static PyObject*
7514unicode_isprintable(PyObject *self)
7515{
7516 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7517 register const Py_UNICODE *e;
7518
7519 /* Shortcut for single character strings */
7520 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7521 Py_RETURN_TRUE;
7522 }
7523
7524 e = p + PyUnicode_GET_SIZE(self);
7525 for (; p < e; p++) {
7526 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7527 Py_RETURN_FALSE;
7528 }
7529 }
7530 Py_RETURN_TRUE;
7531}
7532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007533PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007534 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535\n\
7536Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007537iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538
7539static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007540unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007542 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543}
7544
Martin v. Löwis18e16552006-02-15 17:27:45 +00007545static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546unicode_length(PyUnicodeObject *self)
7547{
7548 return self->length;
7549}
7550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007551PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007554Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007555done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556
7557static PyObject *
7558unicode_ljust(PyUnicodeObject *self, PyObject *args)
7559{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007560 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007561 Py_UNICODE fillchar = ' ';
7562
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007563 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 return NULL;
7565
Tim Peters7a29bd52001-09-12 03:03:31 +00007566 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 Py_INCREF(self);
7568 return (PyObject*) self;
7569 }
7570
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007571 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572}
7573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007574PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007577Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578
7579static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007580unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 return fixup(self, fixlower);
7583}
7584
/* Selector values telling the strip helpers which end(s) to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selector values above.  The
   table is only ever read, so the pointers are const as well as the
   characters they point to. */
static const char * const stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7593
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007594/* externally visible for str.strip(unicode) */
7595PyObject *
7596_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7597{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007598 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7599 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7600 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7601 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7602 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007603
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007605
Benjamin Peterson14339b62009-01-31 16:36:08 +00007606 i = 0;
7607 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7609 i++;
7610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007612
Benjamin Peterson14339b62009-01-31 16:36:08 +00007613 j = len;
7614 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 do {
7616 j--;
7617 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7618 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007619 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007620
Benjamin Peterson14339b62009-01-31 16:36:08 +00007621 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 Py_INCREF(self);
7623 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007624 }
7625 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007627}
7628
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629
7630static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007631do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007633 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7634 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007635
Benjamin Peterson14339b62009-01-31 16:36:08 +00007636 i = 0;
7637 if (striptype != RIGHTSTRIP) {
7638 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7639 i++;
7640 }
7641 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007642
Benjamin Peterson14339b62009-01-31 16:36:08 +00007643 j = len;
7644 if (striptype != LEFTSTRIP) {
7645 do {
7646 j--;
7647 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7648 j++;
7649 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007650
Benjamin Peterson14339b62009-01-31 16:36:08 +00007651 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7652 Py_INCREF(self);
7653 return (PyObject*)self;
7654 }
7655 else
7656 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657}
7658
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007659
7660static PyObject *
7661do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7662{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007663 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007664
Benjamin Peterson14339b62009-01-31 16:36:08 +00007665 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7666 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007667
Benjamin Peterson14339b62009-01-31 16:36:08 +00007668 if (sep != NULL && sep != Py_None) {
7669 if (PyUnicode_Check(sep))
7670 return _PyUnicode_XStrip(self, striptype, sep);
7671 else {
7672 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 "%s arg must be None or str",
7674 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007675 return NULL;
7676 }
7677 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007678
Benjamin Peterson14339b62009-01-31 16:36:08 +00007679 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007680}
7681
7682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007683PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007685\n\
7686Return a copy of the string S with leading and trailing\n\
7687whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007688If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007689
7690static PyObject *
7691unicode_strip(PyUnicodeObject *self, PyObject *args)
7692{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007693 if (PyTuple_GET_SIZE(args) == 0)
7694 return do_strip(self, BOTHSTRIP); /* Common case */
7695 else
7696 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007697}
7698
7699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007700PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007702\n\
7703Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007704If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007705
7706static PyObject *
7707unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7708{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007709 if (PyTuple_GET_SIZE(args) == 0)
7710 return do_strip(self, LEFTSTRIP); /* Common case */
7711 else
7712 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007713}
7714
7715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007716PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007718\n\
7719Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007720If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007721
7722static PyObject *
7723unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7724{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725 if (PyTuple_GET_SIZE(args) == 0)
7726 return do_strip(self, RIGHTSTRIP); /* Common case */
7727 else
7728 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007729}
7730
7731
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007733unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734{
7735 PyUnicodeObject *u;
7736 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007737 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007738 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739
Georg Brandl222de0f2009-04-12 12:01:50 +00007740 if (len < 1) {
7741 Py_INCREF(unicode_empty);
7742 return (PyObject *)unicode_empty;
7743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744
Tim Peters7a29bd52001-09-12 03:03:31 +00007745 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 /* no repeat, return original string */
7747 Py_INCREF(str);
7748 return (PyObject*) str;
7749 }
Tim Peters8f422462000-09-09 06:13:41 +00007750
7751 /* ensure # of chars needed doesn't overflow int and # of bytes
7752 * needed doesn't overflow size_t
7753 */
7754 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007755 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007756 PyErr_SetString(PyExc_OverflowError,
7757 "repeated string is too long");
7758 return NULL;
7759 }
7760 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7761 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7762 PyErr_SetString(PyExc_OverflowError,
7763 "repeated string is too long");
7764 return NULL;
7765 }
7766 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767 if (!u)
7768 return NULL;
7769
7770 p = u->str;
7771
Georg Brandl222de0f2009-04-12 12:01:50 +00007772 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007773 Py_UNICODE_FILL(p, str->str[0], len);
7774 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007775 Py_ssize_t done = str->length; /* number of characters copied this far */
7776 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007778 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007779 Py_UNICODE_COPY(p+done, p, n);
7780 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 }
7783
7784 return (PyObject*) u;
7785}
7786
7787PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 PyObject *subobj,
7789 PyObject *replobj,
7790 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791{
7792 PyObject *self;
7793 PyObject *str1;
7794 PyObject *str2;
7795 PyObject *result;
7796
7797 self = PyUnicode_FromObject(obj);
7798 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800 str1 = PyUnicode_FromObject(subobj);
7801 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 Py_DECREF(self);
7803 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 }
7805 str2 = PyUnicode_FromObject(replobj);
7806 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 Py_DECREF(self);
7808 Py_DECREF(str1);
7809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 }
Tim Petersced69f82003-09-16 20:30:58 +00007811 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 (PyUnicodeObject *)str1,
7813 (PyUnicodeObject *)str2,
7814 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 Py_DECREF(self);
7816 Py_DECREF(str1);
7817 Py_DECREF(str2);
7818 return result;
7819}
7820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007821PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823\n\
7824Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007825old replaced by new. If the optional argument count is\n\
7826given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827
7828static PyObject*
7829unicode_replace(PyUnicodeObject *self, PyObject *args)
7830{
7831 PyUnicodeObject *str1;
7832 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007833 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 PyObject *result;
7835
Martin v. Löwis18e16552006-02-15 17:27:45 +00007836 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837 return NULL;
7838 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7839 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007842 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 Py_DECREF(str1);
7844 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846
7847 result = replace(self, str1, str2, maxcount);
7848
7849 Py_DECREF(str1);
7850 Py_DECREF(str2);
7851 return result;
7852}
7853
7854static
7855PyObject *unicode_repr(PyObject *unicode)
7856{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007857 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007858 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007859 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7860 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7861
7862 /* XXX(nnorwitz): rather than over-allocating, it would be
7863 better to choose a different scheme. Perhaps scan the
7864 first N-chars of the string and allocate based on that size.
7865 */
7866 /* Initial allocation is based on the longest-possible unichr
7867 escape.
7868
7869 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7870 unichr, so in this case it's the longest unichr escape. In
7871 narrow (UTF-16) builds this is five chars per source unichr
7872 since there are two unichrs in the surrogate pair, so in narrow
7873 (UTF-16) builds it's not the longest unichr escape.
7874
7875 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7876 so in the narrow (UTF-16) build case it's the longest unichr
7877 escape.
7878 */
7879
Walter Dörwald1ab83302007-05-18 17:15:44 +00007880 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007882#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007884#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007886#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007888 if (repr == NULL)
7889 return NULL;
7890
Walter Dörwald1ab83302007-05-18 17:15:44 +00007891 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007892
7893 /* Add quote */
7894 *p++ = (findchar(s, size, '\'') &&
7895 !findchar(s, size, '"')) ? '"' : '\'';
7896 while (size-- > 0) {
7897 Py_UNICODE ch = *s++;
7898
7899 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007900 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007901 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007902 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007903 continue;
7904 }
7905
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007907 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007908 *p++ = '\\';
7909 *p++ = 't';
7910 }
7911 else if (ch == '\n') {
7912 *p++ = '\\';
7913 *p++ = 'n';
7914 }
7915 else if (ch == '\r') {
7916 *p++ = '\\';
7917 *p++ = 'r';
7918 }
7919
7920 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007921 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007922 *p++ = '\\';
7923 *p++ = 'x';
7924 *p++ = hexdigits[(ch >> 4) & 0x000F];
7925 *p++ = hexdigits[ch & 0x000F];
7926 }
7927
Georg Brandl559e5d72008-06-11 18:37:52 +00007928 /* Copy ASCII characters as-is */
7929 else if (ch < 0x7F) {
7930 *p++ = ch;
7931 }
7932
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007934 else {
7935 Py_UCS4 ucs = ch;
7936
7937#ifndef Py_UNICODE_WIDE
7938 Py_UNICODE ch2 = 0;
7939 /* Get code point from surrogate pair */
7940 if (size > 0) {
7941 ch2 = *s;
7942 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007944 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007946 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007947 size--;
7948 }
7949 }
7950#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007951 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007952 (categories Z* and C* except ASCII space)
7953 */
7954 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7955 /* Map 8-bit characters to '\xhh' */
7956 if (ucs <= 0xff) {
7957 *p++ = '\\';
7958 *p++ = 'x';
7959 *p++ = hexdigits[(ch >> 4) & 0x000F];
7960 *p++ = hexdigits[ch & 0x000F];
7961 }
7962 /* Map 21-bit characters to '\U00xxxxxx' */
7963 else if (ucs >= 0x10000) {
7964 *p++ = '\\';
7965 *p++ = 'U';
7966 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7967 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7968 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7969 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7970 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7971 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7972 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7973 *p++ = hexdigits[ucs & 0x0000000F];
7974 }
7975 /* Map 16-bit characters to '\uxxxx' */
7976 else {
7977 *p++ = '\\';
7978 *p++ = 'u';
7979 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7980 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7981 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7982 *p++ = hexdigits[ucs & 0x000F];
7983 }
7984 }
7985 /* Copy characters as-is */
7986 else {
7987 *p++ = ch;
7988#ifndef Py_UNICODE_WIDE
7989 if (ucs >= 0x10000)
7990 *p++ = ch2;
7991#endif
7992 }
7993 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007994 }
7995 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007996 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007997
7998 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00007999 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008000 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001}
8002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008003PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005\n\
8006Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008007such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008arguments start and end are interpreted as in slice notation.\n\
8009\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008010Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011
8012static PyObject *
8013unicode_rfind(PyUnicodeObject *self, PyObject *args)
8014{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008015 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008016 Py_ssize_t start;
8017 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008018 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
Christian Heimes9cd17752007-11-18 19:35:23 +00008020 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022
Thomas Wouters477c8d52006-05-27 19:21:47 +00008023 result = stringlib_rfind_slice(
8024 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8025 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8026 start, end
8027 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028
8029 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008030
Christian Heimes217cfd12007-12-02 14:31:20 +00008031 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032}
8033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008034PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008037Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038
8039static PyObject *
8040unicode_rindex(PyUnicodeObject *self, PyObject *args)
8041{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008042 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008043 Py_ssize_t start;
8044 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008045 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046
Christian Heimes9cd17752007-11-18 19:35:23 +00008047 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049
Thomas Wouters477c8d52006-05-27 19:21:47 +00008050 result = stringlib_rfind_slice(
8051 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8052 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8053 start, end
8054 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055
8056 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008057
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 if (result < 0) {
8059 PyErr_SetString(PyExc_ValueError, "substring not found");
8060 return NULL;
8061 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008062 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063}
8064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008065PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008068Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008069done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070
8071static PyObject *
8072unicode_rjust(PyUnicodeObject *self, PyObject *args)
8073{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008074 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008075 Py_UNICODE fillchar = ' ';
8076
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008077 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078 return NULL;
8079
Tim Peters7a29bd52001-09-12 03:03:31 +00008080 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 Py_INCREF(self);
8082 return (PyObject*) self;
8083 }
8084
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008085 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086}
8087
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 PyObject *sep,
8090 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091{
8092 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008093
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 s = PyUnicode_FromObject(s);
8095 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008096 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 if (sep != NULL) {
8098 sep = PyUnicode_FromObject(sep);
8099 if (sep == NULL) {
8100 Py_DECREF(s);
8101 return NULL;
8102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 }
8104
8105 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8106
8107 Py_DECREF(s);
8108 Py_XDECREF(sep);
8109 return result;
8110}
8111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008112PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114\n\
8115Return a list of the words in S, using sep as the\n\
8116delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008117splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008118whitespace string is a separator and empty strings are\n\
8119removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
8121static PyObject*
8122unicode_split(PyUnicodeObject *self, PyObject *args)
8123{
8124 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008125 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
Martin v. Löwis18e16552006-02-15 17:27:45 +00008127 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 return NULL;
8129
8130 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136}
8137
Thomas Wouters477c8d52006-05-27 19:21:47 +00008138PyObject *
8139PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8140{
8141 PyObject* str_obj;
8142 PyObject* sep_obj;
8143 PyObject* out;
8144
8145 str_obj = PyUnicode_FromObject(str_in);
8146 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008148 sep_obj = PyUnicode_FromObject(sep_in);
8149 if (!sep_obj) {
8150 Py_DECREF(str_obj);
8151 return NULL;
8152 }
8153
8154 out = stringlib_partition(
8155 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8156 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8157 );
8158
8159 Py_DECREF(sep_obj);
8160 Py_DECREF(str_obj);
8161
8162 return out;
8163}
8164
8165
8166PyObject *
8167PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8168{
8169 PyObject* str_obj;
8170 PyObject* sep_obj;
8171 PyObject* out;
8172
8173 str_obj = PyUnicode_FromObject(str_in);
8174 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008176 sep_obj = PyUnicode_FromObject(sep_in);
8177 if (!sep_obj) {
8178 Py_DECREF(str_obj);
8179 return NULL;
8180 }
8181
8182 out = stringlib_rpartition(
8183 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8184 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8185 );
8186
8187 Py_DECREF(sep_obj);
8188 Py_DECREF(str_obj);
8189
8190 return out;
8191}
8192
8193PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008195\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008196Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008197the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008198found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008199
8200static PyObject*
8201unicode_partition(PyUnicodeObject *self, PyObject *separator)
8202{
8203 return PyUnicode_Partition((PyObject *)self, separator);
8204}
8205
8206PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008207 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008208\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008209Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008210the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008211separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008212
8213static PyObject*
8214unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8215{
8216 return PyUnicode_RPartition((PyObject *)self, separator);
8217}
8218
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008219PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 PyObject *sep,
8221 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008222{
8223 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008224
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008225 s = PyUnicode_FromObject(s);
8226 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008227 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 if (sep != NULL) {
8229 sep = PyUnicode_FromObject(sep);
8230 if (sep == NULL) {
8231 Py_DECREF(s);
8232 return NULL;
8233 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008234 }
8235
8236 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8237
8238 Py_DECREF(s);
8239 Py_XDECREF(sep);
8240 return result;
8241}
8242
8243PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008245\n\
8246Return a list of the words in S, using sep as the\n\
8247delimiter string, starting at the end of the string and\n\
8248working to the front. If maxsplit is given, at most maxsplit\n\
8249splits are done. If sep is not specified, any whitespace string\n\
8250is a separator.");
8251
8252static PyObject*
8253unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8254{
8255 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008256 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008257
Martin v. Löwis18e16552006-02-15 17:27:45 +00008258 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008259 return NULL;
8260
8261 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008263 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008265 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008267}
8268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008269PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271\n\
8272Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008273Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008274is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275
8276static PyObject*
8277unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8278{
Guido van Rossum86662912000-04-11 15:38:46 +00008279 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280
Guido van Rossum86662912000-04-11 15:38:46 +00008281 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 return NULL;
8283
Guido van Rossum86662912000-04-11 15:38:46 +00008284 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285}
8286
8287static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008288PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289{
Walter Dörwald346737f2007-05-31 10:44:43 +00008290 if (PyUnicode_CheckExact(self)) {
8291 Py_INCREF(self);
8292 return self;
8293 } else
8294 /* Subtype -- return genuine unicode string with the same value. */
8295 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8296 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297}
8298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008299PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301\n\
8302Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008303and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
8305static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008306unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 return fixup(self, fixswapcase);
8309}
8310
Georg Brandlceee0772007-11-27 23:48:05 +00008311PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008313\n\
8314Return a translation table usable for str.translate().\n\
8315If there is only one argument, it must be a dictionary mapping Unicode\n\
8316ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008317Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008318If there are two arguments, they must be strings of equal length, and\n\
8319in the resulting dictionary, each character in x will be mapped to the\n\
8320character at the same position in y. If there is a third argument, it\n\
8321must be a string, whose characters will be mapped to None in the result.");
8322
8323static PyObject*
8324unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8325{
8326 PyObject *x, *y = NULL, *z = NULL;
8327 PyObject *new = NULL, *key, *value;
8328 Py_ssize_t i = 0;
8329 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008330
Georg Brandlceee0772007-11-27 23:48:05 +00008331 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8332 return NULL;
8333 new = PyDict_New();
8334 if (!new)
8335 return NULL;
8336 if (y != NULL) {
8337 /* x must be a string too, of equal length */
8338 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8339 if (!PyUnicode_Check(x)) {
8340 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8341 "be a string if there is a second argument");
8342 goto err;
8343 }
8344 if (PyUnicode_GET_SIZE(x) != ylen) {
8345 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8346 "arguments must have equal length");
8347 goto err;
8348 }
8349 /* create entries for translating chars in x to those in y */
8350 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008351 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8352 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008353 if (!key || !value)
8354 goto err;
8355 res = PyDict_SetItem(new, key, value);
8356 Py_DECREF(key);
8357 Py_DECREF(value);
8358 if (res < 0)
8359 goto err;
8360 }
8361 /* create entries for deleting chars in z */
8362 if (z != NULL) {
8363 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008364 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008365 if (!key)
8366 goto err;
8367 res = PyDict_SetItem(new, key, Py_None);
8368 Py_DECREF(key);
8369 if (res < 0)
8370 goto err;
8371 }
8372 }
8373 } else {
8374 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008375 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008376 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8377 "to maketrans it must be a dict");
8378 goto err;
8379 }
8380 /* copy entries into the new dict, converting string keys to int keys */
8381 while (PyDict_Next(x, &i, &key, &value)) {
8382 if (PyUnicode_Check(key)) {
8383 /* convert string keys to integer keys */
8384 PyObject *newkey;
8385 if (PyUnicode_GET_SIZE(key) != 1) {
8386 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8387 "table must be of length 1");
8388 goto err;
8389 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008390 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008391 if (!newkey)
8392 goto err;
8393 res = PyDict_SetItem(new, newkey, value);
8394 Py_DECREF(newkey);
8395 if (res < 0)
8396 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008397 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008398 /* just keep integer keys */
8399 if (PyDict_SetItem(new, key, value) < 0)
8400 goto err;
8401 } else {
8402 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8403 "be strings or integers");
8404 goto err;
8405 }
8406 }
8407 }
8408 return new;
8409 err:
8410 Py_DECREF(new);
8411 return NULL;
8412}
8413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008414PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416\n\
8417Return a copy of the string S, where all characters have been mapped\n\
8418through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008419Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008420Unmapped characters are left untouched. Characters mapped to None\n\
8421are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422
8423static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008424unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425{
Georg Brandlceee0772007-11-27 23:48:05 +00008426 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427}
8428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008429PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008432Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433
8434static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008435unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 return fixup(self, fixupper);
8438}
8439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008440PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008443Pad a numeric string S with zeros on the left, to fill a field\n\
8444of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445
8446static PyObject *
8447unicode_zfill(PyUnicodeObject *self, PyObject *args)
8448{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008449 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 PyUnicodeObject *u;
8451
Martin v. Löwis18e16552006-02-15 17:27:45 +00008452 Py_ssize_t width;
8453 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454 return NULL;
8455
8456 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008457 if (PyUnicode_CheckExact(self)) {
8458 Py_INCREF(self);
8459 return (PyObject*) self;
8460 }
8461 else
8462 return PyUnicode_FromUnicode(
8463 PyUnicode_AS_UNICODE(self),
8464 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466 }
8467
8468 fill = width - self->length;
8469
8470 u = pad(self, fill, 0, '0');
8471
Walter Dörwald068325e2002-04-15 13:36:47 +00008472 if (u == NULL)
8473 return NULL;
8474
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475 if (u->str[fill] == '+' || u->str[fill] == '-') {
8476 /* move sign to beginning of string */
8477 u->str[0] = u->str[fill];
8478 u->str[fill] = '0';
8479 }
8480
8481 return (PyObject*) u;
8482}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483
#if 0
/* Compiled out: would return the file-level `numfree` counter as a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008492PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008495Return True if S starts with the specified prefix, False otherwise.\n\
8496With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008497With optional end, stop comparing S at that position.\n\
8498prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499
8500static PyObject *
8501unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008504 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008506 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008507 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008508 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008510 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8512 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008513 if (PyTuple_Check(subobj)) {
8514 Py_ssize_t i;
8515 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8516 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008518 if (substring == NULL)
8519 return NULL;
8520 result = tailmatch(self, substring, start, end, -1);
8521 Py_DECREF(substring);
8522 if (result) {
8523 Py_RETURN_TRUE;
8524 }
8525 }
8526 /* nothing matched */
8527 Py_RETURN_FALSE;
8528 }
8529 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008532 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008534 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535}
8536
8537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008538PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008541Return True if S ends with the specified suffix, False otherwise.\n\
8542With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008543With optional end, stop comparing S at that position.\n\
8544suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
8546static PyObject *
8547unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008550 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008552 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008553 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008554 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008556 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8558 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008559 if (PyTuple_Check(subobj)) {
8560 Py_ssize_t i;
8561 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8562 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008564 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008566 result = tailmatch(self, substring, start, end, +1);
8567 Py_DECREF(substring);
8568 if (result) {
8569 Py_RETURN_TRUE;
8570 }
8571 }
8572 Py_RETURN_FALSE;
8573 }
8574 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008578 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008580 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581}
8582
Eric Smith8c663262007-08-25 02:26:07 +00008583#include "stringlib/string_format.h"
8584
8585PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008587\n\
8588");
8589
Eric Smith4a7d76d2008-05-30 18:10:19 +00008590static PyObject *
8591unicode__format__(PyObject* self, PyObject* args)
8592{
8593 PyObject *format_spec;
8594
8595 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8596 return NULL;
8597
8598 return _PyUnicode_FormatAdvanced(self,
8599 PyUnicode_AS_UNICODE(format_spec),
8600 PyUnicode_GET_SIZE(format_spec));
8601}
8602
Eric Smith8c663262007-08-25 02:26:07 +00008603PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008605\n\
8606");
8607
8608static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008609unicode__sizeof__(PyUnicodeObject *v)
8610{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008611 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8612 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008613}
8614
8615PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008617
8618static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008619unicode_getnewargs(PyUnicodeObject *v)
8620{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008621 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008622}
8623
8624
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625static PyMethodDef unicode_methods[] = {
8626
8627 /* Order is according to common usage: often used methods should
8628 appear first, since lookup is done sequentially. */
8629
Benjamin Peterson308d6372009-09-18 21:42:35 +00008630 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008631 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8632 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008633 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008634 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8635 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8636 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8637 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8638 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8639 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8640 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008641 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008642 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8643 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8644 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008645 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008646 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8647 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8648 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008649 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008650 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008651 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008652 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008653 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8654 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8655 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8656 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8657 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8658 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8659 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8660 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8661 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8662 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8663 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8664 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8665 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8666 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008667 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008668 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008669 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008670 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008671 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008672 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8673 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008674 {"maketrans", (PyCFunction) unicode_maketrans,
8675 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008676 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008677#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008678 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679#endif
8680
8681#if 0
8682 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008683 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684#endif
8685
Benjamin Peterson14339b62009-01-31 16:36:08 +00008686 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 {NULL, NULL}
8688};
8689
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008690static PyObject *
8691unicode_mod(PyObject *v, PyObject *w)
8692{
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 if (!PyUnicode_Check(v)) {
8694 Py_INCREF(Py_NotImplemented);
8695 return Py_NotImplemented;
8696 }
8697 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008698}
8699
8700static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008701 0, /*nb_add*/
8702 0, /*nb_subtract*/
8703 0, /*nb_multiply*/
8704 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008705};
8706
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008708 (lenfunc) unicode_length, /* sq_length */
8709 PyUnicode_Concat, /* sq_concat */
8710 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8711 (ssizeargfunc) unicode_getitem, /* sq_item */
8712 0, /* sq_slice */
8713 0, /* sq_ass_item */
8714 0, /* sq_ass_slice */
8715 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716};
8717
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008718static PyObject*
8719unicode_subscript(PyUnicodeObject* self, PyObject* item)
8720{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008721 if (PyIndex_Check(item)) {
8722 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008723 if (i == -1 && PyErr_Occurred())
8724 return NULL;
8725 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008726 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008727 return unicode_getitem(self, i);
8728 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008729 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008730 Py_UNICODE* source_buf;
8731 Py_UNICODE* result_buf;
8732 PyObject* result;
8733
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008734 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008736 return NULL;
8737 }
8738
8739 if (slicelength <= 0) {
8740 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008741 } else if (start == 0 && step == 1 && slicelength == self->length &&
8742 PyUnicode_CheckExact(self)) {
8743 Py_INCREF(self);
8744 return (PyObject *)self;
8745 } else if (step == 1) {
8746 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008747 } else {
8748 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008749 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8750 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008751
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 if (result_buf == NULL)
8753 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008754
8755 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8756 result_buf[i] = source_buf[cur];
8757 }
Tim Petersced69f82003-09-16 20:30:58 +00008758
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008759 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008760 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008761 return result;
8762 }
8763 } else {
8764 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8765 return NULL;
8766 }
8767}
8768
8769static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008770 (lenfunc)unicode_length, /* mp_length */
8771 (binaryfunc)unicode_subscript, /* mp_subscript */
8772 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008773};
8774
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776/* Helpers for PyUnicode_Format() */
8777
8778static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008779getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008781 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 (*p_argidx)++;
8784 if (arglen < 0)
8785 return args;
8786 else
8787 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 }
8789 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 return NULL;
8792}
8793
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008794/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008796static PyObject *
8797formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008799 char *p;
8800 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008802
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 x = PyFloat_AsDouble(v);
8804 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008805 return NULL;
8806
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008809
Eric Smith0923d1d2009-04-16 20:16:10 +00008810 p = PyOS_double_to_string(x, type, prec,
8811 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008812 if (p == NULL)
8813 return NULL;
8814 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008815 PyMem_Free(p);
8816 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817}
8818
Tim Peters38fd5b62000-09-21 05:43:11 +00008819static PyObject*
8820formatlong(PyObject *val, int flags, int prec, int type)
8821{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008822 char *buf;
8823 int len;
8824 PyObject *str; /* temporary string object. */
8825 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008826
Benjamin Peterson14339b62009-01-31 16:36:08 +00008827 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8828 if (!str)
8829 return NULL;
8830 result = PyUnicode_FromStringAndSize(buf, len);
8831 Py_DECREF(str);
8832 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008833}
8834
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835static int
8836formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008837 size_t buflen,
8838 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008840 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008841 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 if (PyUnicode_GET_SIZE(v) == 1) {
8843 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8844 buf[1] = '\0';
8845 return 1;
8846 }
8847#ifndef Py_UNICODE_WIDE
8848 if (PyUnicode_GET_SIZE(v) == 2) {
8849 /* Decode a valid surrogate pair */
8850 int c0 = PyUnicode_AS_UNICODE(v)[0];
8851 int c1 = PyUnicode_AS_UNICODE(v)[1];
8852 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8853 0xDC00 <= c1 && c1 <= 0xDFFF) {
8854 buf[0] = c0;
8855 buf[1] = c1;
8856 buf[2] = '\0';
8857 return 2;
8858 }
8859 }
8860#endif
8861 goto onError;
8862 }
8863 else {
8864 /* Integer input truncated to a character */
8865 long x;
8866 x = PyLong_AsLong(v);
8867 if (x == -1 && PyErr_Occurred())
8868 goto onError;
8869
8870 if (x < 0 || x > 0x10ffff) {
8871 PyErr_SetString(PyExc_OverflowError,
8872 "%c arg not in range(0x110000)");
8873 return -1;
8874 }
8875
8876#ifndef Py_UNICODE_WIDE
8877 if (x > 0xffff) {
8878 x -= 0x10000;
8879 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8880 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8881 return 2;
8882 }
8883#endif
8884 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008885 buf[1] = '\0';
8886 return 1;
8887 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008888
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008890 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008892 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893}
8894
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008895/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008896 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008897*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008898#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008899
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902{
8903 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008904 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 int args_owned = 0;
8906 PyUnicodeObject *result = NULL;
8907 PyObject *dict = NULL;
8908 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008909
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 PyErr_BadInternalCall();
8912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913 }
8914 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008915 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917 fmt = PyUnicode_AS_UNICODE(uformat);
8918 fmtcnt = PyUnicode_GET_SIZE(uformat);
8919
8920 reslen = rescnt = fmtcnt + 100;
8921 result = _PyUnicode_New(reslen);
8922 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 res = PyUnicode_AS_UNICODE(result);
8925
8926 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 arglen = PyTuple_Size(args);
8928 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929 }
8930 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 arglen = -1;
8932 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008934 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008935 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937
8938 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 if (*fmt != '%') {
8940 if (--rescnt < 0) {
8941 rescnt = fmtcnt + 100;
8942 reslen += rescnt;
8943 if (_PyUnicode_Resize(&result, reslen) < 0)
8944 goto onError;
8945 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8946 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008947 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008949 }
8950 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 /* Got a format specifier */
8952 int flags = 0;
8953 Py_ssize_t width = -1;
8954 int prec = -1;
8955 Py_UNICODE c = '\0';
8956 Py_UNICODE fill;
8957 int isnumok;
8958 PyObject *v = NULL;
8959 PyObject *temp = NULL;
8960 Py_UNICODE *pbuf;
8961 Py_UNICODE sign;
8962 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008963 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 fmt++;
8966 if (*fmt == '(') {
8967 Py_UNICODE *keystart;
8968 Py_ssize_t keylen;
8969 PyObject *key;
8970 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00008971
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 if (dict == NULL) {
8973 PyErr_SetString(PyExc_TypeError,
8974 "format requires a mapping");
8975 goto onError;
8976 }
8977 ++fmt;
8978 --fmtcnt;
8979 keystart = fmt;
8980 /* Skip over balanced parentheses */
8981 while (pcount > 0 && --fmtcnt >= 0) {
8982 if (*fmt == ')')
8983 --pcount;
8984 else if (*fmt == '(')
8985 ++pcount;
8986 fmt++;
8987 }
8988 keylen = fmt - keystart - 1;
8989 if (fmtcnt < 0 || pcount > 0) {
8990 PyErr_SetString(PyExc_ValueError,
8991 "incomplete format key");
8992 goto onError;
8993 }
8994#if 0
8995 /* keys are converted to strings using UTF-8 and
8996 then looked up since Python uses strings to hold
8997 variables names etc. in its namespaces and we
8998 wouldn't want to break common idioms. */
8999 key = PyUnicode_EncodeUTF8(keystart,
9000 keylen,
9001 NULL);
9002#else
9003 key = PyUnicode_FromUnicode(keystart, keylen);
9004#endif
9005 if (key == NULL)
9006 goto onError;
9007 if (args_owned) {
9008 Py_DECREF(args);
9009 args_owned = 0;
9010 }
9011 args = PyObject_GetItem(dict, key);
9012 Py_DECREF(key);
9013 if (args == NULL) {
9014 goto onError;
9015 }
9016 args_owned = 1;
9017 arglen = -1;
9018 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009019 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 while (--fmtcnt >= 0) {
9021 switch (c = *fmt++) {
9022 case '-': flags |= F_LJUST; continue;
9023 case '+': flags |= F_SIGN; continue;
9024 case ' ': flags |= F_BLANK; continue;
9025 case '#': flags |= F_ALT; continue;
9026 case '0': flags |= F_ZERO; continue;
9027 }
9028 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009029 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009030 if (c == '*') {
9031 v = getnextarg(args, arglen, &argidx);
9032 if (v == NULL)
9033 goto onError;
9034 if (!PyLong_Check(v)) {
9035 PyErr_SetString(PyExc_TypeError,
9036 "* wants int");
9037 goto onError;
9038 }
9039 width = PyLong_AsLong(v);
9040 if (width == -1 && PyErr_Occurred())
9041 goto onError;
9042 if (width < 0) {
9043 flags |= F_LJUST;
9044 width = -width;
9045 }
9046 if (--fmtcnt >= 0)
9047 c = *fmt++;
9048 }
9049 else if (c >= '0' && c <= '9') {
9050 width = c - '0';
9051 while (--fmtcnt >= 0) {
9052 c = *fmt++;
9053 if (c < '0' || c > '9')
9054 break;
9055 if ((width*10) / 10 != width) {
9056 PyErr_SetString(PyExc_ValueError,
9057 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009058 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009059 }
9060 width = width*10 + (c - '0');
9061 }
9062 }
9063 if (c == '.') {
9064 prec = 0;
9065 if (--fmtcnt >= 0)
9066 c = *fmt++;
9067 if (c == '*') {
9068 v = getnextarg(args, arglen, &argidx);
9069 if (v == NULL)
9070 goto onError;
9071 if (!PyLong_Check(v)) {
9072 PyErr_SetString(PyExc_TypeError,
9073 "* wants int");
9074 goto onError;
9075 }
9076 prec = PyLong_AsLong(v);
9077 if (prec == -1 && PyErr_Occurred())
9078 goto onError;
9079 if (prec < 0)
9080 prec = 0;
9081 if (--fmtcnt >= 0)
9082 c = *fmt++;
9083 }
9084 else if (c >= '0' && c <= '9') {
9085 prec = c - '0';
9086 while (--fmtcnt >= 0) {
9087 c = Py_CHARMASK(*fmt++);
9088 if (c < '0' || c > '9')
9089 break;
9090 if ((prec*10) / 10 != prec) {
9091 PyErr_SetString(PyExc_ValueError,
9092 "prec too big");
9093 goto onError;
9094 }
9095 prec = prec*10 + (c - '0');
9096 }
9097 }
9098 } /* prec */
9099 if (fmtcnt >= 0) {
9100 if (c == 'h' || c == 'l' || c == 'L') {
9101 if (--fmtcnt >= 0)
9102 c = *fmt++;
9103 }
9104 }
9105 if (fmtcnt < 0) {
9106 PyErr_SetString(PyExc_ValueError,
9107 "incomplete format");
9108 goto onError;
9109 }
9110 if (c != '%') {
9111 v = getnextarg(args, arglen, &argidx);
9112 if (v == NULL)
9113 goto onError;
9114 }
9115 sign = 0;
9116 fill = ' ';
9117 switch (c) {
9118
9119 case '%':
9120 pbuf = formatbuf;
9121 /* presume that buffer length is at least 1 */
9122 pbuf[0] = '%';
9123 len = 1;
9124 break;
9125
9126 case 's':
9127 case 'r':
9128 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009129 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 temp = v;
9131 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009132 }
9133 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 if (c == 's')
9135 temp = PyObject_Str(v);
9136 else if (c == 'r')
9137 temp = PyObject_Repr(v);
9138 else
9139 temp = PyObject_ASCII(v);
9140 if (temp == NULL)
9141 goto onError;
9142 if (PyUnicode_Check(temp))
9143 /* nothing to do */;
9144 else {
9145 Py_DECREF(temp);
9146 PyErr_SetString(PyExc_TypeError,
9147 "%s argument has non-string str()");
9148 goto onError;
9149 }
9150 }
9151 pbuf = PyUnicode_AS_UNICODE(temp);
9152 len = PyUnicode_GET_SIZE(temp);
9153 if (prec >= 0 && len > prec)
9154 len = prec;
9155 break;
9156
9157 case 'i':
9158 case 'd':
9159 case 'u':
9160 case 'o':
9161 case 'x':
9162 case 'X':
9163 if (c == 'i')
9164 c = 'd';
9165 isnumok = 0;
9166 if (PyNumber_Check(v)) {
9167 PyObject *iobj=NULL;
9168
9169 if (PyLong_Check(v)) {
9170 iobj = v;
9171 Py_INCREF(iobj);
9172 }
9173 else {
9174 iobj = PyNumber_Long(v);
9175 }
9176 if (iobj!=NULL) {
9177 if (PyLong_Check(iobj)) {
9178 isnumok = 1;
9179 temp = formatlong(iobj, flags, prec, c);
9180 Py_DECREF(iobj);
9181 if (!temp)
9182 goto onError;
9183 pbuf = PyUnicode_AS_UNICODE(temp);
9184 len = PyUnicode_GET_SIZE(temp);
9185 sign = 1;
9186 }
9187 else {
9188 Py_DECREF(iobj);
9189 }
9190 }
9191 }
9192 if (!isnumok) {
9193 PyErr_Format(PyExc_TypeError,
9194 "%%%c format: a number is required, "
9195 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9196 goto onError;
9197 }
9198 if (flags & F_ZERO)
9199 fill = '0';
9200 break;
9201
9202 case 'e':
9203 case 'E':
9204 case 'f':
9205 case 'F':
9206 case 'g':
9207 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009208 temp = formatfloat(v, flags, prec, c);
9209 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009211 pbuf = PyUnicode_AS_UNICODE(temp);
9212 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009213 sign = 1;
9214 if (flags & F_ZERO)
9215 fill = '0';
9216 break;
9217
9218 case 'c':
9219 pbuf = formatbuf;
9220 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9221 if (len < 0)
9222 goto onError;
9223 break;
9224
9225 default:
9226 PyErr_Format(PyExc_ValueError,
9227 "unsupported format character '%c' (0x%x) "
9228 "at index %zd",
9229 (31<=c && c<=126) ? (char)c : '?',
9230 (int)c,
9231 (Py_ssize_t)(fmt - 1 -
9232 PyUnicode_AS_UNICODE(uformat)));
9233 goto onError;
9234 }
9235 if (sign) {
9236 if (*pbuf == '-' || *pbuf == '+') {
9237 sign = *pbuf++;
9238 len--;
9239 }
9240 else if (flags & F_SIGN)
9241 sign = '+';
9242 else if (flags & F_BLANK)
9243 sign = ' ';
9244 else
9245 sign = 0;
9246 }
9247 if (width < len)
9248 width = len;
9249 if (rescnt - (sign != 0) < width) {
9250 reslen -= rescnt;
9251 rescnt = width + fmtcnt + 100;
9252 reslen += rescnt;
9253 if (reslen < 0) {
9254 Py_XDECREF(temp);
9255 PyErr_NoMemory();
9256 goto onError;
9257 }
9258 if (_PyUnicode_Resize(&result, reslen) < 0) {
9259 Py_XDECREF(temp);
9260 goto onError;
9261 }
9262 res = PyUnicode_AS_UNICODE(result)
9263 + reslen - rescnt;
9264 }
9265 if (sign) {
9266 if (fill != ' ')
9267 *res++ = sign;
9268 rescnt--;
9269 if (width > len)
9270 width--;
9271 }
9272 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9273 assert(pbuf[0] == '0');
9274 assert(pbuf[1] == c);
9275 if (fill != ' ') {
9276 *res++ = *pbuf++;
9277 *res++ = *pbuf++;
9278 }
9279 rescnt -= 2;
9280 width -= 2;
9281 if (width < 0)
9282 width = 0;
9283 len -= 2;
9284 }
9285 if (width > len && !(flags & F_LJUST)) {
9286 do {
9287 --rescnt;
9288 *res++ = fill;
9289 } while (--width > len);
9290 }
9291 if (fill == ' ') {
9292 if (sign)
9293 *res++ = sign;
9294 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9295 assert(pbuf[0] == '0');
9296 assert(pbuf[1] == c);
9297 *res++ = *pbuf++;
9298 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009299 }
9300 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009301 Py_UNICODE_COPY(res, pbuf, len);
9302 res += len;
9303 rescnt -= len;
9304 while (--width >= len) {
9305 --rescnt;
9306 *res++ = ' ';
9307 }
9308 if (dict && (argidx < arglen) && c != '%') {
9309 PyErr_SetString(PyExc_TypeError,
9310 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009311 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009312 goto onError;
9313 }
9314 Py_XDECREF(temp);
9315 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316 } /* until end */
9317 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009318 PyErr_SetString(PyExc_TypeError,
9319 "not all arguments converted during string formatting");
9320 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 }
9322
Thomas Woutersa96affe2006-03-12 00:29:36 +00009323 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009324 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327 }
9328 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329 return (PyObject *)result;
9330
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332 Py_XDECREF(result);
9333 Py_DECREF(uformat);
9334 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009335 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336 }
9337 return NULL;
9338}
9339
Jeremy Hylton938ace62002-07-17 16:30:39 +00009340static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009341unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9342
Tim Peters6d6c1a32001-08-02 04:15:00 +00009343static PyObject *
9344unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9345{
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009347 static char *kwlist[] = {"object", "encoding", "errors", 0};
9348 char *encoding = NULL;
9349 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009350
Benjamin Peterson14339b62009-01-31 16:36:08 +00009351 if (type != &PyUnicode_Type)
9352 return unicode_subtype_new(type, args, kwds);
9353 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009355 return NULL;
9356 if (x == NULL)
9357 return (PyObject *)_PyUnicode_New(0);
9358 if (encoding == NULL && errors == NULL)
9359 return PyObject_Str(x);
9360 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009362}
9363
Guido van Rossume023fe02001-08-30 03:12:59 +00009364static PyObject *
9365unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9366{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009367 PyUnicodeObject *tmp, *pnew;
9368 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009369
Benjamin Peterson14339b62009-01-31 16:36:08 +00009370 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9371 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9372 if (tmp == NULL)
9373 return NULL;
9374 assert(PyUnicode_Check(tmp));
9375 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9376 if (pnew == NULL) {
9377 Py_DECREF(tmp);
9378 return NULL;
9379 }
9380 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9381 if (pnew->str == NULL) {
9382 _Py_ForgetReference((PyObject *)pnew);
9383 PyObject_Del(pnew);
9384 Py_DECREF(tmp);
9385 return PyErr_NoMemory();
9386 }
9387 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9388 pnew->length = n;
9389 pnew->hash = tmp->hash;
9390 Py_DECREF(tmp);
9391 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009392}
9393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009394PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009396\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009397Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009398encoding defaults to the current default string encoding.\n\
9399errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009400
/* Forward declaration: unicode_iter() is defined further down in this
   file but is needed for the tp_iter slot of PyUnicode_Type. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for "str".  The type is subclassable (Py_TPFLAGS_BASETYPE),
   carries the Py_TPFLAGS_UNICODE_SUBCLASS flag, derives directly from
   PyBaseObject_Type, creates instances through unicode_new and releases
   their memory with PyObject_Del.  Slots are given positionally, so the
   order of the entries must not be changed. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                      /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_reserved */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash*/
    0,                          /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    unicode_iter,               /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
9446
9447/* Initialize the Unicode implementation */
9448
Thomas Wouters78890102000-07-22 19:25:51 +00009449void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009451 int i;
9452
Thomas Wouters477c8d52006-05-27 19:21:47 +00009453 /* XXX - move this array to unicodectype.c ? */
9454 Py_UNICODE linebreak[] = {
9455 0x000A, /* LINE FEED */
9456 0x000D, /* CARRIAGE RETURN */
9457 0x001C, /* FILE SEPARATOR */
9458 0x001D, /* GROUP SEPARATOR */
9459 0x001E, /* RECORD SEPARATOR */
9460 0x0085, /* NEXT LINE */
9461 0x2028, /* LINE SEPARATOR */
9462 0x2029, /* PARAGRAPH SEPARATOR */
9463 };
9464
Fred Drakee4315f52000-05-09 19:53:39 +00009465 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009466 free_list = NULL;
9467 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009469 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009471
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009472 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009474 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009476
9477 /* initialize the linebreak bloom filter */
9478 bloom_linebreak = make_bloom_mask(
9479 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9480 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009481
9482 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483}
9484
9485/* Finalize the Unicode implementation */
9486
Christian Heimesa156e092008-02-16 07:38:31 +00009487int
9488PyUnicode_ClearFreeList(void)
9489{
9490 int freelist_size = numfree;
9491 PyUnicodeObject *u;
9492
9493 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 PyUnicodeObject *v = u;
9495 u = *(PyUnicodeObject **)u;
9496 if (v->str)
9497 PyObject_DEL(v->str);
9498 Py_XDECREF(v->defenc);
9499 PyObject_Del(v);
9500 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009501 }
9502 free_list = NULL;
9503 assert(numfree == 0);
9504 return freelist_size;
9505}
9506
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507void
Thomas Wouters78890102000-07-22 19:25:51 +00009508_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009510 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009512 Py_XDECREF(unicode_empty);
9513 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009514
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009515 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 if (unicode_latin1[i]) {
9517 Py_DECREF(unicode_latin1[i]);
9518 unicode_latin1[i] = NULL;
9519 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009520 }
Christian Heimesa156e092008-02-16 07:38:31 +00009521 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009523
Walter Dörwald16807132007-05-25 13:52:07 +00009524void
9525PyUnicode_InternInPlace(PyObject **p)
9526{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009527 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9528 PyObject *t;
9529 if (s == NULL || !PyUnicode_Check(s))
9530 Py_FatalError(
9531 "PyUnicode_InternInPlace: unicode strings only please!");
9532 /* If it's a subclass, we don't really know what putting
9533 it in the interned dict might do. */
9534 if (!PyUnicode_CheckExact(s))
9535 return;
9536 if (PyUnicode_CHECK_INTERNED(s))
9537 return;
9538 if (interned == NULL) {
9539 interned = PyDict_New();
9540 if (interned == NULL) {
9541 PyErr_Clear(); /* Don't leave an exception */
9542 return;
9543 }
9544 }
9545 /* It might be that the GetItem call fails even
9546 though the key is present in the dictionary,
9547 namely when this happens during a stack overflow. */
9548 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009550 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009551
Benjamin Peterson29060642009-01-31 22:14:21 +00009552 if (t) {
9553 Py_INCREF(t);
9554 Py_DECREF(*p);
9555 *p = t;
9556 return;
9557 }
Walter Dörwald16807132007-05-25 13:52:07 +00009558
Benjamin Peterson14339b62009-01-31 16:36:08 +00009559 PyThreadState_GET()->recursion_critical = 1;
9560 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9561 PyErr_Clear();
9562 PyThreadState_GET()->recursion_critical = 0;
9563 return;
9564 }
9565 PyThreadState_GET()->recursion_critical = 0;
9566 /* The two references in interned are not counted by refcnt.
9567 The deallocator will take care of this */
9568 Py_REFCNT(s) -= 2;
9569 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009570}
9571
9572void
9573PyUnicode_InternImmortal(PyObject **p)
9574{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009575 PyUnicode_InternInPlace(p);
9576 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9577 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9578 Py_INCREF(*p);
9579 }
Walter Dörwald16807132007-05-25 13:52:07 +00009580}
9581
9582PyObject *
9583PyUnicode_InternFromString(const char *cp)
9584{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009585 PyObject *s = PyUnicode_FromString(cp);
9586 if (s == NULL)
9587 return NULL;
9588 PyUnicode_InternInPlace(&s);
9589 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009590}
9591
9592void _Py_ReleaseInternedUnicodeStrings(void)
9593{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009594 PyObject *keys;
9595 PyUnicodeObject *s;
9596 Py_ssize_t i, n;
9597 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009598
Benjamin Peterson14339b62009-01-31 16:36:08 +00009599 if (interned == NULL || !PyDict_Check(interned))
9600 return;
9601 keys = PyDict_Keys(interned);
9602 if (keys == NULL || !PyList_Check(keys)) {
9603 PyErr_Clear();
9604 return;
9605 }
Walter Dörwald16807132007-05-25 13:52:07 +00009606
Benjamin Peterson14339b62009-01-31 16:36:08 +00009607 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9608 detector, interned unicode strings are not forcibly deallocated;
9609 rather, we give them their stolen references back, and then clear
9610 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009611
Benjamin Peterson14339b62009-01-31 16:36:08 +00009612 n = PyList_GET_SIZE(keys);
9613 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009614 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009615 for (i = 0; i < n; i++) {
9616 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9617 switch (s->state) {
9618 case SSTATE_NOT_INTERNED:
9619 /* XXX Shouldn't happen */
9620 break;
9621 case SSTATE_INTERNED_IMMORTAL:
9622 Py_REFCNT(s) += 1;
9623 immortal_size += s->length;
9624 break;
9625 case SSTATE_INTERNED_MORTAL:
9626 Py_REFCNT(s) += 2;
9627 mortal_size += s->length;
9628 break;
9629 default:
9630 Py_FatalError("Inconsistent interned string state.");
9631 }
9632 s->state = SSTATE_NOT_INTERNED;
9633 }
9634 fprintf(stderr, "total size of all interned strings: "
9635 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9636 "mortal/immortal\n", mortal_size, immortal_size);
9637 Py_DECREF(keys);
9638 PyDict_Clear(interned);
9639 Py_DECREF(interned);
9640 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009641}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009642
9643
9644/********************* Unicode Iterator **************************/
9645
/* State of an iterator over a unicode string (type "str_iterator"). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* Index of the next character to yield */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9651
9652static void
9653unicodeiter_dealloc(unicodeiterobject *it)
9654{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009655 _PyObject_GC_UNTRACK(it);
9656 Py_XDECREF(it->it_seq);
9657 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009658}
9659
9660static int
9661unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9662{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009663 Py_VISIT(it->it_seq);
9664 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009665}
9666
9667static PyObject *
9668unicodeiter_next(unicodeiterobject *it)
9669{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009670 PyUnicodeObject *seq;
9671 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009672
Benjamin Peterson14339b62009-01-31 16:36:08 +00009673 assert(it != NULL);
9674 seq = it->it_seq;
9675 if (seq == NULL)
9676 return NULL;
9677 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009678
Benjamin Peterson14339b62009-01-31 16:36:08 +00009679 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9680 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009681 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009682 if (item != NULL)
9683 ++it->it_index;
9684 return item;
9685 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009686
Benjamin Peterson14339b62009-01-31 16:36:08 +00009687 Py_DECREF(seq);
9688 it->it_seq = NULL;
9689 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009690}
9691
9692static PyObject *
9693unicodeiter_len(unicodeiterobject *it)
9694{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009695 Py_ssize_t len = 0;
9696 if (it->it_seq)
9697 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9698 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009699}
9700
/* Docstring for the __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table for str_iterator objects; referenced from the tp_methods
   slot of PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9708
/* Type object for "str_iterator", the iterator created by unicode_iter().
   Instances take part in garbage collection (Py_TPFLAGS_HAVE_GC) and
   iterating one returns the iterator itself (PyObject_SelfIter).  Slots
   are given positionally, so the order of the entries must not be
   changed; slots after the last listed entry are left zero-initialized. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
9741
9742static PyObject *
9743unicode_iter(PyObject *seq)
9744{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009745 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009746
Benjamin Peterson14339b62009-01-31 16:36:08 +00009747 if (!PyUnicode_Check(seq)) {
9748 PyErr_BadInternalCall();
9749 return NULL;
9750 }
9751 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9752 if (it == NULL)
9753 return NULL;
9754 it->it_index = 0;
9755 Py_INCREF(seq);
9756 it->it_seq = (PyUnicodeObject *)seq;
9757 _PyObject_GC_TRACK(it);
9758 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009759}
9760
Martin v. Löwis5b222132007-06-10 09:51:05 +00009761size_t
9762Py_UNICODE_strlen(const Py_UNICODE *u)
9763{
9764 int res = 0;
9765 while(*u++)
9766 res++;
9767 return res;
9768}
9769
9770Py_UNICODE*
9771Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9772{
9773 Py_UNICODE *u = s1;
9774 while ((*u++ = *s2++));
9775 return s1;
9776}
9777
9778Py_UNICODE*
9779Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9780{
9781 Py_UNICODE *u = s1;
9782 while ((*u++ = *s2++))
9783 if (n-- == 0)
9784 break;
9785 return s1;
9786}
9787
9788int
9789Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9790{
9791 while (*s1 && *s2 && *s1 == *s2)
9792 s1++, s2++;
9793 if (*s1 && *s2)
9794 return (*s1 < *s2) ? -1 : +1;
9795 if (*s1)
9796 return 1;
9797 if (*s2)
9798 return -1;
9799 return 0;
9800}
9801
9802Py_UNICODE*
9803Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9804{
9805 const Py_UNICODE *p;
9806 for (p = s; *p; p++)
9807 if (*p == c)
9808 return (Py_UNICODE*)p;
9809 return NULL;
9810}
9811
9812
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009813#ifdef __cplusplus
9814}
9815#endif