blob: b50293c89c6ad05a11f33ecaef203305b7fa6f1a [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Upper bound on the number of objects parked on the Unicode free list */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "stay alive" optimization.

   Objects that are put on the free list keep their allocated Unicode
   buffer as long as their length is below this limit; longer buffers
   are released.  This saves malloc() calls for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature!  If Unicode objects cause
   core dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c is 1 if the 7-bit character c counts as
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x09 HORIZONTAL TABULATION
     *   0x0A LINE FEED
     *   0x0B VERTICAL TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner158701d2010-04-22 19:41:01 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Same kind of lookup table for linebreaks: entry c is 1 if the 7-bit
   character c is a line break and 0 otherwise.  The table is only ever
   read (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0A LINE FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
193
194
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000195Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000196PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000198#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000199 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000200#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 /* This is actually an illegal character, so it should
202 not be passed to unichr. */
203 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000204#endif
205}
206
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things
   simple, a single bitmask is used, and the 5 least significant bits of
   each Unicode character select the bit index. */

/* The linebreak mask is not initialized here; it has to be set up by the
   Unicode initialization code (_PyUnicode_Init) before BLOOM_LINEBREAK
   is used. */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.

   The shifted constant is 1UL rather than a plain int 1: (ch) & 0x1F can
   be 31, and shifting a 32-bit int 1 left by 31 is undefined behavior.
   The mask argument is parenthesized so that expressions such as
   a | b can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if ch is a line break: characters below 128 are looked up in
   ascii_linebreak; everything else must pass the bloom pre-filter before
   the full Py_UNICODE_ISLINEBREAK() test is made. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
225Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
226{
227 /* calculate simple bloom-style bitmask for a given unicode string */
228
229 long mask;
230 Py_ssize_t i;
231
232 mask = 0;
233 for (i = 0; i < len; i++)
234 mask |= (1 << (ptr[i] & 0x1F));
235
236 return mask;
237}
238
239Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
240{
241 Py_ssize_t i;
242
243 for (i = 0; i < setlen; i++)
244 if (set[i] == chr)
245 return 1;
246
247 return 0;
248}
249
/* Non-zero if chr is one of the setlen characters in set.  The cheap
   bloom test on mask runs first, so unicode_member() is only called for
   characters that might be in the set.

   The whole expansion is parenthesized so that the macro can be negated
   or combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
252
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253/* --- Unicode Object ----------------------------------------------------- */
254
255static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258{
259 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000260
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000261 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000263 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 /* Resizing shared object (unicode_empty or single character
266 objects) in-place is not allowed. Use PyUnicode_Resize()
267 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000268
Benjamin Peterson14339b62009-01-31 16:36:08 +0000269 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 (unicode->length == 1 &&
271 unicode->str[0] < 256U &&
272 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000274 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 return -1;
276 }
277
Thomas Wouters477c8d52006-05-27 19:21:47 +0000278 /* We allocate one more byte to make sure the string is Ux0000 terminated.
279 The overallocation is also used by fastsearch, which assumes that it's
280 safe to look at str[length] (without making any assumptions about what
281 it contains). */
282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000284 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000285 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000287 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 PyErr_NoMemory();
289 return -1;
290 }
291 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000292 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293
Benjamin Peterson29060642009-01-31 22:14:21 +0000294 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000296 if (unicode->defenc) {
Georg Brandl1fa11af2010-08-01 21:03:01 +0000297 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298 }
299 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000300
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 return 0;
302}
303
304/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000305 Ux0000 terminated; some code (e.g. new_identifier)
306 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307
308 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000309 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311*/
312
313static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000314PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315{
316 register PyUnicodeObject *unicode;
317
Thomas Wouters477c8d52006-05-27 19:21:47 +0000318 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 if (length == 0 && unicode_empty != NULL) {
320 Py_INCREF(unicode_empty);
321 return unicode_empty;
322 }
323
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000324 /* Ensure we won't overflow the size. */
325 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
326 return (PyUnicodeObject *)PyErr_NoMemory();
327 }
328
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000330 if (free_list) {
331 unicode = free_list;
332 free_list = *(PyUnicodeObject **)unicode;
333 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000334 if (unicode->str) {
335 /* Keep-Alive optimization: we only upsize the buffer,
336 never downsize it. */
337 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000338 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 PyObject_DEL(unicode->str);
340 unicode->str = NULL;
341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000342 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000343 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 }
347 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 }
349 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000350 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000351 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 if (unicode == NULL)
353 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
355 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 }
357
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000358 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 PyErr_NoMemory();
360 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000362 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000363 * the caller fails before initializing str -- unicode_resize()
364 * reads str[0], and the Keep-Alive optimization can keep memory
365 * allocated for str alive across a call to unicode_dealloc(unicode).
366 * We don't want unicode_resize to read uninitialized memory in
367 * that case.
368 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000369 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000371 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000373 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000374 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000376
Benjamin Peterson29060642009-01-31 22:14:21 +0000377 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000378 /* XXX UNREF/NEWREF interface should be more symmetrical */
379 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000381 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383}
384
385static
Guido van Rossum9475a232001-10-05 20:51:39 +0000386void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387{
Walter Dörwald16807132007-05-25 13:52:07 +0000388 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000389 case SSTATE_NOT_INTERNED:
390 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000391
Benjamin Peterson29060642009-01-31 22:14:21 +0000392 case SSTATE_INTERNED_MORTAL:
393 /* revive dead object temporarily for DelItem */
394 Py_REFCNT(unicode) = 3;
395 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
396 Py_FatalError(
397 "deletion of interned string failed");
398 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000399
Benjamin Peterson29060642009-01-31 22:14:21 +0000400 case SSTATE_INTERNED_IMMORTAL:
401 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000402
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 default:
404 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000405 }
406
Guido van Rossum604ddf82001-12-06 20:03:56 +0000407 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000409 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000410 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
411 PyObject_DEL(unicode->str);
412 unicode->str = NULL;
413 unicode->length = 0;
414 }
415 if (unicode->defenc) {
Georg Brandl1fa11af2010-08-01 21:03:01 +0000416 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 }
418 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000419 *(PyUnicodeObject **)unicode = free_list;
420 free_list = unicode;
421 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000422 }
423 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 PyObject_DEL(unicode->str);
425 Py_XDECREF(unicode->defenc);
426 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428}
429
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000430static
431int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000432{
433 register PyUnicodeObject *v;
434
435 /* Argument checks */
436 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000437 PyErr_BadInternalCall();
438 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000439 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000440 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000441 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
445
446 /* Resizing unicode_empty and single character objects is not
447 possible since these are being shared. We simply return a fresh
448 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000449 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000450 (v == unicode_empty || v->length == 1)) {
451 PyUnicodeObject *w = _PyUnicode_New(length);
452 if (w == NULL)
453 return -1;
454 Py_UNICODE_COPY(w->str, v->str,
455 length < v->length ? length : v->length);
456 Py_DECREF(*unicode);
457 *unicode = w;
458 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
460
461 /* Note that we don't have to modify *unicode for unshared Unicode
462 objects, since we can modify them in-place. */
463 return unicode_resize(v, length);
464}
465
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000466int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
467{
468 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
469}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000470
Guido van Rossumd57fd912000-03-10 22:53:23 +0000471PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000472 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000473{
474 PyUnicodeObject *unicode;
475
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476 /* If the Unicode data is known at construction time, we can apply
477 some optimizations which share commonly used objects. */
478 if (u != NULL) {
479
Benjamin Peterson29060642009-01-31 22:14:21 +0000480 /* Optimization for empty strings */
481 if (size == 0 && unicode_empty != NULL) {
482 Py_INCREF(unicode_empty);
483 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000484 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000485
486 /* Single character Unicode objects in the Latin-1 range are
487 shared when using this constructor */
488 if (size == 1 && *u < 256) {
489 unicode = unicode_latin1[*u];
490 if (!unicode) {
491 unicode = _PyUnicode_New(1);
492 if (!unicode)
493 return NULL;
494 unicode->str[0] = *u;
495 unicode_latin1[*u] = unicode;
496 }
497 Py_INCREF(unicode);
498 return (PyObject *)unicode;
499 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000500 }
Tim Petersced69f82003-09-16 20:30:58 +0000501
Guido van Rossumd57fd912000-03-10 22:53:23 +0000502 unicode = _PyUnicode_New(size);
503 if (!unicode)
504 return NULL;
505
506 /* Copy the Unicode data into the new object */
507 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000508 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509
510 return (PyObject *)unicode;
511}
512
Walter Dörwaldd2034312007-05-18 16:29:38 +0000513PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000514{
515 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000516
Benjamin Peterson14339b62009-01-31 16:36:08 +0000517 if (size < 0) {
518 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000519 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000520 return NULL;
521 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000522
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000523 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000524 some optimizations which share commonly used objects.
525 Also, this means the input must be UTF-8, so fall back to the
526 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 if (u != NULL) {
528
Benjamin Peterson29060642009-01-31 22:14:21 +0000529 /* Optimization for empty strings */
530 if (size == 0 && unicode_empty != NULL) {
531 Py_INCREF(unicode_empty);
532 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000533 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000534
535 /* Single characters are shared when using this constructor.
536 Restrict to ASCII, since the input must be UTF-8. */
537 if (size == 1 && Py_CHARMASK(*u) < 128) {
538 unicode = unicode_latin1[Py_CHARMASK(*u)];
539 if (!unicode) {
540 unicode = _PyUnicode_New(1);
541 if (!unicode)
542 return NULL;
543 unicode->str[0] = Py_CHARMASK(*u);
544 unicode_latin1[Py_CHARMASK(*u)] = unicode;
545 }
546 Py_INCREF(unicode);
547 return (PyObject *)unicode;
548 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000549
550 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000551 }
552
Walter Dörwald55507312007-05-18 13:12:10 +0000553 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000554 if (!unicode)
555 return NULL;
556
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000557 return (PyObject *)unicode;
558}
559
Walter Dörwaldd2034312007-05-18 16:29:38 +0000560PyObject *PyUnicode_FromString(const char *u)
561{
562 size_t size = strlen(u);
563 if (size > PY_SSIZE_T_MAX) {
564 PyErr_SetString(PyExc_OverflowError, "input too long");
565 return NULL;
566 }
567
568 return PyUnicode_FromStringAndSize(u, size);
569}
570
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571#ifdef HAVE_WCHAR_H
572
Mark Dickinson081dfee2009-03-18 14:47:41 +0000573#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
574# define CONVERT_WCHAR_TO_SURROGATES
575#endif
576
577#ifdef CONVERT_WCHAR_TO_SURROGATES
578
579/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
580 to convert from UTF32 to UTF16. */
581
582PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
583 Py_ssize_t size)
584{
585 PyUnicodeObject *unicode;
586 register Py_ssize_t i;
587 Py_ssize_t alloc;
588 const wchar_t *orig_w;
589
590 if (w == NULL) {
591 if (size == 0)
592 return PyUnicode_FromStringAndSize(NULL, 0);
593 PyErr_BadInternalCall();
594 return NULL;
595 }
596
597 if (size == -1) {
598 size = wcslen(w);
599 }
600
601 alloc = size;
602 orig_w = w;
603 for (i = size; i > 0; i--) {
604 if (*w > 0xFFFF)
605 alloc++;
606 w++;
607 }
608 w = orig_w;
609 unicode = _PyUnicode_New(alloc);
610 if (!unicode)
611 return NULL;
612
613 /* Copy the wchar_t data into the new object */
614 {
615 register Py_UNICODE *u;
616 u = PyUnicode_AS_UNICODE(unicode);
617 for (i = size; i > 0; i--) {
618 if (*w > 0xFFFF) {
619 wchar_t ordinal = *w++;
620 ordinal -= 0x10000;
621 *u++ = 0xD800 | (ordinal >> 10);
622 *u++ = 0xDC00 | (ordinal & 0x3FF);
623 }
624 else
625 *u++ = *w++;
626 }
627 }
628 return (PyObject *)unicode;
629}
630
631#else
632
Guido van Rossumd57fd912000-03-10 22:53:23 +0000633PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000634 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635{
636 PyUnicodeObject *unicode;
637
638 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000639 if (size == 0)
640 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000641 PyErr_BadInternalCall();
642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643 }
644
Martin v. Löwis790465f2008-04-05 20:41:37 +0000645 if (size == -1) {
646 size = wcslen(w);
647 }
648
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649 unicode = _PyUnicode_New(size);
650 if (!unicode)
651 return NULL;
652
653 /* Copy the wchar_t data into the new object */
654#ifdef HAVE_USABLE_WCHAR_T
655 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000656#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000658 register Py_UNICODE *u;
659 register Py_ssize_t i;
660 u = PyUnicode_AS_UNICODE(unicode);
661 for (i = size; i > 0; i--)
662 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 }
664#endif
665
666 return (PyObject *)unicode;
667}
668
Mark Dickinson081dfee2009-03-18 14:47:41 +0000669#endif /* CONVERT_WCHAR_TO_SURROGATES */
670
671#undef CONVERT_WCHAR_TO_SURROGATES
672
Walter Dörwald346737f2007-05-31 10:44:43 +0000673static void
674makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
675{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000676 *fmt++ = '%';
677 if (width) {
678 if (zeropad)
679 *fmt++ = '0';
680 fmt += sprintf(fmt, "%d", width);
681 }
682 if (precision)
683 fmt += sprintf(fmt, ".%d", precision);
684 if (longflag)
685 *fmt++ = 'l';
686 else if (size_tflag) {
687 char *f = PY_FORMAT_SIZE_T;
688 while (*f)
689 *fmt++ = *f++;
690 }
691 *fmt++ = c;
692 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000693}
694
/* Append the NUL-terminated char string `string` at the output position
   s and advance s past the copied characters (the terminating NUL is not
   copied).  Relies on the variables `s` and `copy` of the function that
   uses the macro. */
#define appendstring(string) \
    {copy = (string); while (*copy) *s++ = *copy++;}
696
697PyObject *
698PyUnicode_FromFormatV(const char *format, va_list vargs)
699{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000700 va_list count;
701 Py_ssize_t callcount = 0;
702 PyObject **callresults = NULL;
703 PyObject **callresult = NULL;
704 Py_ssize_t n = 0;
705 int width = 0;
706 int precision = 0;
707 int zeropad;
708 const char* f;
709 Py_UNICODE *s;
710 PyObject *string;
711 /* used by sprintf */
712 char buffer[21];
713 /* use abuffer instead of buffer, if we need more space
714 * (which can happen if there's a format specifier with width). */
715 char *abuffer = NULL;
716 char *realbuffer;
717 Py_ssize_t abuffersize = 0;
718 char fmt[60]; /* should be enough for %0width.precisionld */
719 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000720
721#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000722 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000723#else
724#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000726#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000727 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728#endif
729#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000730 /* step 1: count the number of %S/%R/%A/%s format specifications
731 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
732 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
733 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000734 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000735 if (*f == '%') {
736 if (*(f+1)=='%')
737 continue;
738 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
739 ++callcount;
740 while (ISDIGIT((unsigned)*f))
741 width = (width*10) + *f++ - '0';
742 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
743 ;
744 if (*f == 's')
745 ++callcount;
746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 }
748 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000749 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000750 if (callcount) {
751 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
752 if (!callresults) {
753 PyErr_NoMemory();
754 return NULL;
755 }
756 callresult = callresults;
757 }
758 /* step 3: figure out how large a buffer we need */
759 for (f = format; *f; f++) {
760 if (*f == '%') {
761 const char* p = f;
762 width = 0;
763 while (ISDIGIT((unsigned)*f))
764 width = (width*10) + *f++ - '0';
765 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
766 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000767
Benjamin Peterson14339b62009-01-31 16:36:08 +0000768 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
769 * they don't affect the amount of space we reserve.
770 */
771 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000772 (f[1] == 'd' || f[1] == 'u'))
773 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000774
Benjamin Peterson14339b62009-01-31 16:36:08 +0000775 switch (*f) {
776 case 'c':
777 (void)va_arg(count, int);
778 /* fall through... */
779 case '%':
780 n++;
781 break;
782 case 'd': case 'u': case 'i': case 'x':
783 (void) va_arg(count, int);
784 /* 20 bytes is enough to hold a 64-bit
785 integer. Decimal takes the most space.
786 This isn't enough for octal.
787 If a width is specified we need more
788 (which we allocate later). */
789 if (width < 20)
790 width = 20;
791 n += width;
792 if (abuffersize < width)
793 abuffersize = width;
794 break;
795 case 's':
796 {
797 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000798 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000799 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
800 if (!str)
801 goto fail;
802 n += PyUnicode_GET_SIZE(str);
803 /* Remember the str and switch to the next slot */
804 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000805 break;
806 }
807 case 'U':
808 {
809 PyObject *obj = va_arg(count, PyObject *);
810 assert(obj && PyUnicode_Check(obj));
811 n += PyUnicode_GET_SIZE(obj);
812 break;
813 }
814 case 'V':
815 {
816 PyObject *obj = va_arg(count, PyObject *);
817 const char *str = va_arg(count, const char *);
818 assert(obj || str);
819 assert(!obj || PyUnicode_Check(obj));
820 if (obj)
821 n += PyUnicode_GET_SIZE(obj);
822 else
823 n += strlen(str);
824 break;
825 }
826 case 'S':
827 {
828 PyObject *obj = va_arg(count, PyObject *);
829 PyObject *str;
830 assert(obj);
831 str = PyObject_Str(obj);
832 if (!str)
833 goto fail;
834 n += PyUnicode_GET_SIZE(str);
835 /* Remember the str and switch to the next slot */
836 *callresult++ = str;
837 break;
838 }
839 case 'R':
840 {
841 PyObject *obj = va_arg(count, PyObject *);
842 PyObject *repr;
843 assert(obj);
844 repr = PyObject_Repr(obj);
845 if (!repr)
846 goto fail;
847 n += PyUnicode_GET_SIZE(repr);
848 /* Remember the repr and switch to the next slot */
849 *callresult++ = repr;
850 break;
851 }
852 case 'A':
853 {
854 PyObject *obj = va_arg(count, PyObject *);
855 PyObject *ascii;
856 assert(obj);
857 ascii = PyObject_ASCII(obj);
858 if (!ascii)
859 goto fail;
860 n += PyUnicode_GET_SIZE(ascii);
861 /* Remember the repr and switch to the next slot */
862 *callresult++ = ascii;
863 break;
864 }
865 case 'p':
866 (void) va_arg(count, int);
867 /* maximum 64-bit pointer representation:
868 * 0xffffffffffffffff
869 * so 19 characters is enough.
870 * XXX I count 18 -- what's the extra for?
871 */
872 n += 19;
873 break;
874 default:
875 /* if we stumble upon an unknown
876 formatting code, copy the rest of
877 the format string to the output
878 string. (we cannot just skip the
879 code, since there's no way to know
880 what's in the argument list) */
881 n += strlen(p);
882 goto expand;
883 }
884 } else
885 n++;
886 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000887 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000888 if (abuffersize > 20) {
889 abuffer = PyObject_Malloc(abuffersize);
890 if (!abuffer) {
891 PyErr_NoMemory();
892 goto fail;
893 }
894 realbuffer = abuffer;
895 }
896 else
897 realbuffer = buffer;
898 /* step 4: fill the buffer */
899 /* Since we've analyzed how much space we need for the worst case,
900 we don't have to resize the string.
901 There can be no errors beyond this point. */
902 string = PyUnicode_FromUnicode(NULL, n);
903 if (!string)
904 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000905
Benjamin Peterson14339b62009-01-31 16:36:08 +0000906 s = PyUnicode_AS_UNICODE(string);
907 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000908
Benjamin Peterson14339b62009-01-31 16:36:08 +0000909 for (f = format; *f; f++) {
910 if (*f == '%') {
911 const char* p = f++;
912 int longflag = 0;
913 int size_tflag = 0;
914 zeropad = (*f == '0');
915 /* parse the width.precision part */
916 width = 0;
917 while (ISDIGIT((unsigned)*f))
918 width = (width*10) + *f++ - '0';
919 precision = 0;
920 if (*f == '.') {
921 f++;
922 while (ISDIGIT((unsigned)*f))
923 precision = (precision*10) + *f++ - '0';
924 }
925 /* handle the long flag, but only for %ld and %lu.
926 others can be added when necessary. */
927 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
928 longflag = 1;
929 ++f;
930 }
931 /* handle the size_t flag. */
932 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
933 size_tflag = 1;
934 ++f;
935 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000936
Benjamin Peterson14339b62009-01-31 16:36:08 +0000937 switch (*f) {
938 case 'c':
939 *s++ = va_arg(vargs, int);
940 break;
941 case 'd':
942 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
943 if (longflag)
944 sprintf(realbuffer, fmt, va_arg(vargs, long));
945 else if (size_tflag)
946 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
947 else
948 sprintf(realbuffer, fmt, va_arg(vargs, int));
949 appendstring(realbuffer);
950 break;
951 case 'u':
952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
953 if (longflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
955 else if (size_tflag)
956 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
957 else
958 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
959 appendstring(realbuffer);
960 break;
961 case 'i':
962 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
963 sprintf(realbuffer, fmt, va_arg(vargs, int));
964 appendstring(realbuffer);
965 break;
966 case 'x':
967 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
968 sprintf(realbuffer, fmt, va_arg(vargs, int));
969 appendstring(realbuffer);
970 break;
971 case 's':
972 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000973 /* unused, since we already have the result */
974 (void) va_arg(vargs, char *);
975 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
976 PyUnicode_GET_SIZE(*callresult));
977 s += PyUnicode_GET_SIZE(*callresult);
978 /* We're done with the unicode()/repr() => forget it */
979 Py_DECREF(*callresult);
980 /* switch to next unicode()/repr() result */
981 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 break;
983 }
984 case 'U':
985 {
986 PyObject *obj = va_arg(vargs, PyObject *);
987 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
988 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
989 s += size;
990 break;
991 }
992 case 'V':
993 {
994 PyObject *obj = va_arg(vargs, PyObject *);
995 const char *str = va_arg(vargs, const char *);
996 if (obj) {
997 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999 s += size;
1000 } else {
1001 appendstring(str);
1002 }
1003 break;
1004 }
1005 case 'S':
1006 case 'R':
1007 {
1008 Py_UNICODE *ucopy;
1009 Py_ssize_t usize;
1010 Py_ssize_t upos;
1011 /* unused, since we already have the result */
1012 (void) va_arg(vargs, PyObject *);
1013 ucopy = PyUnicode_AS_UNICODE(*callresult);
1014 usize = PyUnicode_GET_SIZE(*callresult);
1015 for (upos = 0; upos<usize;)
1016 *s++ = ucopy[upos++];
1017 /* We're done with the unicode()/repr() => forget it */
1018 Py_DECREF(*callresult);
1019 /* switch to next unicode()/repr() result */
1020 ++callresult;
1021 break;
1022 }
1023 case 'p':
1024 sprintf(buffer, "%p", va_arg(vargs, void*));
1025 /* %p is ill-defined: ensure leading 0x. */
1026 if (buffer[1] == 'X')
1027 buffer[1] = 'x';
1028 else if (buffer[1] != 'x') {
1029 memmove(buffer+2, buffer, strlen(buffer)+1);
1030 buffer[0] = '0';
1031 buffer[1] = 'x';
1032 }
1033 appendstring(buffer);
1034 break;
1035 case '%':
1036 *s++ = '%';
1037 break;
1038 default:
1039 appendstring(p);
1040 goto end;
1041 }
1042 } else
1043 *s++ = *f;
1044 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001045
Benjamin Peterson29060642009-01-31 22:14:21 +00001046 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001047 if (callresults)
1048 PyObject_Free(callresults);
1049 if (abuffer)
1050 PyObject_Free(abuffer);
1051 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1052 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001053 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001054 if (callresults) {
1055 PyObject **callresult2 = callresults;
1056 while (callresult2 < callresult) {
1057 Py_DECREF(*callresult2);
1058 ++callresult2;
1059 }
1060 PyObject_Free(callresults);
1061 }
1062 if (abuffer)
1063 PyObject_Free(abuffer);
1064 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001065}
1066
1067#undef appendstring
1068
1069PyObject *
1070PyUnicode_FromFormat(const char *format, ...)
1071{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 PyObject* ret;
1073 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001074
1075#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001076 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001077#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001080 ret = PyUnicode_FromFormatV(format, vargs);
1081 va_end(vargs);
1082 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001083}
1084
Martin v. Löwis18e16552006-02-15 17:27:45 +00001085Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001086 wchar_t *w,
1087 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088{
1089 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001090 PyErr_BadInternalCall();
1091 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001093
1094 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001096 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001097
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098#ifdef HAVE_USABLE_WCHAR_T
1099 memcpy(w, unicode->str, size * sizeof(wchar_t));
1100#else
1101 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001102 register Py_UNICODE *u;
1103 register Py_ssize_t i;
1104 u = PyUnicode_AS_UNICODE(unicode);
1105 for (i = size; i > 0; i--)
1106 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107 }
1108#endif
1109
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001110 if (size > PyUnicode_GET_SIZE(unicode))
1111 return PyUnicode_GET_SIZE(unicode);
1112 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001113 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114}
1115
1116#endif
1117
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001118PyObject *PyUnicode_FromOrdinal(int ordinal)
1119{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001120 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001121
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001122 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_SetString(PyExc_ValueError,
1124 "chr() arg not in range(0x110000)");
1125 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001126 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001127
1128#ifndef Py_UNICODE_WIDE
1129 if (ordinal > 0xffff) {
1130 ordinal -= 0x10000;
1131 s[0] = 0xD800 | (ordinal >> 10);
1132 s[1] = 0xDC00 | (ordinal & 0x3FF);
1133 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001134 }
1135#endif
1136
Hye-Shik Chang40574832004-04-06 07:24:51 +00001137 s[0] = (Py_UNICODE)ordinal;
1138 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001139}
1140
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141PyObject *PyUnicode_FromObject(register PyObject *obj)
1142{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001143 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001144 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001145 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 Py_INCREF(obj);
1147 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001148 }
1149 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001150 /* For a Unicode subtype that's not a Unicode object,
1151 return a true Unicode object with the same data. */
1152 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1153 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001154 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001155 PyErr_Format(PyExc_TypeError,
1156 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001157 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001158 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001159}
1160
1161PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 const char *encoding,
1163 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164{
Antoine Pitroua2983c62010-09-01 15:16:41 +00001165 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001167
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001169 PyErr_BadInternalCall();
1170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172
Antoine Pitroua2983c62010-09-01 15:16:41 +00001173 /* Decoding bytes objects is the most common case and should be fast */
1174 if (PyBytes_Check(obj)) {
1175 if (PyBytes_GET_SIZE(obj) == 0) {
1176 Py_INCREF(unicode_empty);
1177 v = (PyObject *) unicode_empty;
1178 }
1179 else {
1180 v = PyUnicode_Decode(
1181 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1182 encoding, errors);
1183 }
1184 return v;
1185 }
1186
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001187 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001188 PyErr_SetString(PyExc_TypeError,
1189 "decoding str is not supported");
1190 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001192
Antoine Pitroua2983c62010-09-01 15:16:41 +00001193 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1194 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1195 PyErr_Format(PyExc_TypeError,
1196 "coercing to str: need bytes, bytearray "
1197 "or buffer-like object, %.80s found",
1198 Py_TYPE(obj)->tp_name);
1199 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201
Antoine Pitroua2983c62010-09-01 15:16:41 +00001202 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001203 Py_INCREF(unicode_empty);
Antoine Pitroua2983c62010-09-01 15:16:41 +00001204 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 }
Tim Petersced69f82003-09-16 20:30:58 +00001206 else
Antoine Pitroua2983c62010-09-01 15:16:41 +00001207 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001208
Antoine Pitroua2983c62010-09-01 15:16:41 +00001209 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001210 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211}
1212
1213PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001214 Py_ssize_t size,
1215 const char *encoding,
1216 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217{
1218 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001219 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001220 char lower[20]; /* Enough for any encoding name we recognize */
1221 char *l;
1222 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223
1224 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001225 encoding = PyUnicode_GetDefaultEncoding();
1226
1227 /* Convert encoding to lower case and replace '_' with '-' in order to
1228 catch e.g. UTF_8 */
1229 e = encoding;
1230 l = lower;
1231 while (*e && l < &lower[(sizeof lower) - 2]) {
1232 if (ISUPPER(*e)) {
1233 *l++ = TOLOWER(*e++);
1234 }
1235 else if (*e == '_') {
1236 *l++ = '-';
1237 e++;
1238 }
1239 else {
1240 *l++ = *e++;
1241 }
1242 }
1243 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001244
1245 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001246 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001248 else if ((strcmp(lower, "latin-1") == 0) ||
1249 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001250 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001251#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001252 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001253 return PyUnicode_DecodeMBCS(s, size, errors);
1254#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001255 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001256 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001257 else if (strcmp(lower, "utf-16") == 0)
1258 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1259 else if (strcmp(lower, "utf-32") == 0)
1260 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261
1262 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001263 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001264 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001265 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001266 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 if (buffer == NULL)
1268 goto onError;
1269 unicode = PyCodec_Decode(buffer, encoding, errors);
1270 if (unicode == NULL)
1271 goto onError;
1272 if (!PyUnicode_Check(unicode)) {
1273 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001274 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001275 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_DECREF(unicode);
1277 goto onError;
1278 }
1279 Py_DECREF(buffer);
1280 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001281
Benjamin Peterson29060642009-01-31 22:14:21 +00001282 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 Py_XDECREF(buffer);
1284 return NULL;
1285}
1286
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001287PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v;
1292
1293 if (!PyUnicode_Check(unicode)) {
1294 PyErr_BadArgument();
1295 goto onError;
1296 }
1297
1298 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001299 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001300
1301 /* Decode via the codec registry */
1302 v = PyCodec_Decode(unicode, encoding, errors);
1303 if (v == NULL)
1304 goto onError;
1305 return v;
1306
Benjamin Peterson29060642009-01-31 22:14:21 +00001307 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001308 return NULL;
1309}
1310
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001311PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1312 const char *encoding,
1313 const char *errors)
1314{
1315 PyObject *v;
1316
1317 if (!PyUnicode_Check(unicode)) {
1318 PyErr_BadArgument();
1319 goto onError;
1320 }
1321
1322 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001323 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001324
1325 /* Decode via the codec registry */
1326 v = PyCodec_Decode(unicode, encoding, errors);
1327 if (v == NULL)
1328 goto onError;
1329 if (!PyUnicode_Check(v)) {
1330 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001331 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001332 Py_TYPE(v)->tp_name);
1333 Py_DECREF(v);
1334 goto onError;
1335 }
1336 return v;
1337
Benjamin Peterson29060642009-01-31 22:14:21 +00001338 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001339 return NULL;
1340}
1341
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001343 Py_ssize_t size,
1344 const char *encoding,
1345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346{
1347 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 unicode = PyUnicode_FromUnicode(s, size);
1350 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1353 Py_DECREF(unicode);
1354 return v;
1355}
1356
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001357PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1358 const char *encoding,
1359 const char *errors)
1360{
1361 PyObject *v;
1362
1363 if (!PyUnicode_Check(unicode)) {
1364 PyErr_BadArgument();
1365 goto onError;
1366 }
1367
1368 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001369 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001370
1371 /* Encode via the codec registry */
1372 v = PyCodec_Encode(unicode, encoding, errors);
1373 if (v == NULL)
1374 goto onError;
1375 return v;
1376
Benjamin Peterson29060642009-01-31 22:14:21 +00001377 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001378 return NULL;
1379}
1380
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1382 const char *encoding,
1383 const char *errors)
1384{
1385 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001386
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 if (!PyUnicode_Check(unicode)) {
1388 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 }
Fred Drakee4315f52000-05-09 19:53:39 +00001391
Tim Petersced69f82003-09-16 20:30:58 +00001392 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001393 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001394
1395 /* Shortcuts for common default encodings */
1396 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 if (strcmp(encoding, "utf-8") == 0)
1398 return PyUnicode_AsUTF8String(unicode);
1399 else if (strcmp(encoding, "latin-1") == 0)
1400 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001401#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 else if (strcmp(encoding, "mbcs") == 0)
1403 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001404#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 else if (strcmp(encoding, "ascii") == 0)
1406 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001407 /* During bootstrap, we may need to find the encodings
1408 package, to load the file system encoding, and require the
1409 file system encoding in order to load the encodings
1410 package.
1411
1412 Break out of this dependency by assuming that the path to
1413 the encodings module is ASCII-only. XXX could try wcstombs
1414 instead, if the file system encoding is the locale's
1415 encoding. */
1416 else if (Py_FileSystemDefaultEncoding &&
1417 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1418 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001419 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421
1422 /* Encode via the codec registry */
1423 v = PyCodec_Encode(unicode, encoding, errors);
1424 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001425 return NULL;
1426
1427 /* The normal path */
1428 if (PyBytes_Check(v))
1429 return v;
1430
1431 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001432 if (PyByteArray_Check(v)) {
1433 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001434 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001435 PyOS_snprintf(msg, sizeof(msg),
1436 "encoder %s returned buffer instead of bytes",
1437 encoding);
1438 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001439 Py_DECREF(v);
1440 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001441 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001442
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001443 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1444 Py_DECREF(v);
1445 return b;
1446 }
1447
1448 PyErr_Format(PyExc_TypeError,
1449 "encoder did not return a bytes object (type=%.400s)",
1450 Py_TYPE(v)->tp_name);
1451 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001452 return NULL;
1453}
1454
1455PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1456 const char *encoding,
1457 const char *errors)
1458{
1459 PyObject *v;
1460
1461 if (!PyUnicode_Check(unicode)) {
1462 PyErr_BadArgument();
1463 goto onError;
1464 }
1465
1466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001468
1469 /* Encode via the codec registry */
1470 v = PyCodec_Encode(unicode, encoding, errors);
1471 if (v == NULL)
1472 goto onError;
1473 if (!PyUnicode_Check(v)) {
1474 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001475 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001476 Py_TYPE(v)->tp_name);
1477 Py_DECREF(v);
1478 goto onError;
1479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483 return NULL;
1484}
1485
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001486PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001488{
1489 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001490 if (v)
1491 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001492 if (errors != NULL)
1493 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001494 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001495 PyUnicode_GET_SIZE(unicode),
1496 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001497 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001498 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001499 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001500 return v;
1501}
1502
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001503PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001504PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001505 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001506 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1507}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001508
Christian Heimes5894ba72007-11-04 11:43:14 +00001509PyObject*
1510PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1511{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001512 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1513 can be undefined. If it is case, decode using UTF-8. The following assumes
1514 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1515 bootstrapping process where the codecs aren't ready yet.
1516 */
1517 if (Py_FileSystemDefaultEncoding) {
1518#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001519 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001520 return PyUnicode_DecodeMBCS(s, size, "replace");
1521 }
1522#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001523 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001524 return PyUnicode_DecodeUTF8(s, size, "replace");
1525 }
1526#endif
1527 return PyUnicode_Decode(s, size,
1528 Py_FileSystemDefaultEncoding,
1529 "replace");
1530 }
1531 else {
1532 return PyUnicode_DecodeUTF8(s, size, "replace");
1533 }
1534}
1535
Martin v. Löwis011e8422009-05-05 04:43:17 +00001536/* Convert the argument to a bytes object, according to the file
1537 system encoding */
1538
1539int
1540PyUnicode_FSConverter(PyObject* arg, void* addr)
1541{
1542 PyObject *output = NULL;
1543 Py_ssize_t size;
1544 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001545 if (arg == NULL) {
1546 Py_DECREF(*(PyObject**)addr);
1547 return 1;
1548 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001549 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1550 output = arg;
1551 Py_INCREF(output);
1552 }
1553 else {
1554 arg = PyUnicode_FromObject(arg);
1555 if (!arg)
1556 return 0;
1557 output = PyUnicode_AsEncodedObject(arg,
1558 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001559 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001560 Py_DECREF(arg);
1561 if (!output)
1562 return 0;
1563 if (!PyBytes_Check(output)) {
1564 Py_DECREF(output);
1565 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1566 return 0;
1567 }
1568 }
1569 if (PyBytes_Check(output)) {
1570 size = PyBytes_GET_SIZE(output);
1571 data = PyBytes_AS_STRING(output);
1572 }
1573 else {
1574 size = PyByteArray_GET_SIZE(output);
1575 data = PyByteArray_AS_STRING(output);
1576 }
1577 if (size != strlen(data)) {
1578 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1579 Py_DECREF(output);
1580 return 0;
1581 }
1582 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001583 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001584}
1585
1586
Martin v. Löwis5b222132007-06-10 09:51:05 +00001587char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001588_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001589{
Christian Heimesf3863112007-11-22 07:46:41 +00001590 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001591 if (!PyUnicode_Check(unicode)) {
1592 PyErr_BadArgument();
1593 return NULL;
1594 }
Christian Heimesf3863112007-11-22 07:46:41 +00001595 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1596 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001597 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001598 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001599 *psize = PyBytes_GET_SIZE(bytes);
1600 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001601}
1602
1603char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001604_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001605{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001606 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001607}
1608
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1610{
1611 if (!PyUnicode_Check(unicode)) {
1612 PyErr_BadArgument();
1613 goto onError;
1614 }
1615 return PyUnicode_AS_UNICODE(unicode);
1616
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 return NULL;
1619}
1620
Martin v. Löwis18e16552006-02-15 17:27:45 +00001621Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622{
1623 if (!PyUnicode_Check(unicode)) {
1624 PyErr_BadArgument();
1625 goto onError;
1626 }
1627 return PyUnicode_GET_SIZE(unicode);
1628
Benjamin Peterson29060642009-01-31 22:14:21 +00001629 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 return -1;
1631}
1632
Thomas Wouters78890102000-07-22 19:25:51 +00001633const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001634{
1635 return unicode_default_encoding;
1636}
1637
1638int PyUnicode_SetDefaultEncoding(const char *encoding)
1639{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001640 if (strcmp(encoding, unicode_default_encoding) != 0) {
1641 PyErr_Format(PyExc_ValueError,
1642 "Can only set default encoding to %s",
1643 unicode_default_encoding);
1644 return -1;
1645 }
Fred Drakee4315f52000-05-09 19:53:39 +00001646 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001647}
1648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649/* error handling callback helper:
1650 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001651 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652 and adjust various state variables.
1653 return 0 on success, -1 on error
1654*/
1655
1656static
1657int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001658 const char *encoding, const char *reason,
1659 const char **input, const char **inend, Py_ssize_t *startinpos,
1660 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1661 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001662{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001663 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664
1665 PyObject *restuple = NULL;
1666 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001667 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001668 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001669 Py_ssize_t requiredsize;
1670 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001671 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001672 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001673 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674 int res = -1;
1675
1676 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001677 *errorHandler = PyCodec_LookupError(errors);
1678 if (*errorHandler == NULL)
1679 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 }
1681
1682 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001683 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001684 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1685 if (*exceptionObject == NULL)
1686 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001687 }
1688 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001689 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1690 goto onError;
1691 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1692 goto onError;
1693 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1694 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001695 }
1696
1697 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1698 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001699 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001701 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001702 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001703 }
1704 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001705 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001706
1707 /* Copy back the bytes variables, which might have been modified by the
1708 callback */
1709 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1710 if (!inputobj)
1711 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001712 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001713 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001714 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001715 *input = PyBytes_AS_STRING(inputobj);
1716 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001717 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001718 /* we can DECREF safely, as the exception has another reference,
1719 so the object won't go away. */
1720 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001724 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001725 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1726 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001727 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728
1729 /* need more space? (at least enough for what we
1730 have+the replacement+the rest of the string (starting
1731 at the new input position), so we won't have to check space
1732 when there are no errors in the rest of the string) */
1733 repptr = PyUnicode_AS_UNICODE(repunicode);
1734 repsize = PyUnicode_GET_SIZE(repunicode);
1735 requiredsize = *outpos + repsize + insize-newpos;
1736 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001737 if (requiredsize<2*outsize)
1738 requiredsize = 2*outsize;
1739 if (_PyUnicode_Resize(output, requiredsize) < 0)
1740 goto onError;
1741 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 }
1743 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001744 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001745 Py_UNICODE_COPY(*outptr, repptr, repsize);
1746 *outptr += repsize;
1747 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001748
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001749 /* we made it! */
1750 res = 0;
1751
Benjamin Peterson29060642009-01-31 22:14:21 +00001752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001753 Py_XDECREF(restuple);
1754 return res;
1755}
1756
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001757/* --- UTF-7 Codec -------------------------------------------------------- */
1758
Antoine Pitrou244651a2009-05-04 18:56:13 +00001759/* See RFC2152 for details. We encode conservatively and decode liberally. */
1760
/* Three simple macros defining base-64, plus the decoder's pass-through
   test.  All of them evaluate their argument more than once, so pass
   side-effect-free expressions only. */

/* True if c is one of the 64 characters of the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* Base-64 value of c.  Only meaningful when IS_BASE64(c) holds: any
   character outside the letter, digit and '+' cases maps to 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the bottom 6 bits of n; higher bits are
   masked off. */

#define TO_BASE64(n)                                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) < 128)
1791
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times; directO and directWS are parenthesized
 * so that compound expressions such as (a || b) combine correctly with
 * the && inside the macro. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001837
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001838PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001839 Py_ssize_t size,
1840 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001841{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001842 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1843}
1844
Antoine Pitrou244651a2009-05-04 18:56:13 +00001845/* The decoder. The only state we preserve is our read position,
1846 * i.e. how many characters we have consumed. So if we end in the
1847 * middle of a shift sequence we have to back off the read position
1848 * and the output to the beginning of the sequence, otherwise we lose
1849 * all the shift state (seen bits, number of bits seen, high
1850 * surrogate). */
1851
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001852PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001853 Py_ssize_t size,
1854 const char *errors,
1855 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001858 Py_ssize_t startinpos;
1859 Py_ssize_t endinpos;
1860 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001861 const char *e;
1862 PyUnicodeObject *unicode;
1863 Py_UNICODE *p;
1864 const char *errmsg = "";
1865 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001866 Py_UNICODE *shiftOutStart;
1867 unsigned int base64bits = 0;
1868 unsigned long base64buffer = 0;
1869 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870 PyObject *errorHandler = NULL;
1871 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872
1873 unicode = _PyUnicode_New(size);
1874 if (!unicode)
1875 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001876 if (size == 0) {
1877 if (consumed)
1878 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001880 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881
1882 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001883 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001884 e = s + size;
1885
1886 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001887 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001888 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001889 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001890
Antoine Pitrou244651a2009-05-04 18:56:13 +00001891 if (inShift) { /* in a base-64 section */
1892 if (IS_BASE64(ch)) { /* consume a base-64 character */
1893 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1894 base64bits += 6;
1895 s++;
1896 if (base64bits >= 16) {
1897 /* we have enough bits for a UTF-16 value */
1898 Py_UNICODE outCh = (Py_UNICODE)
1899 (base64buffer >> (base64bits-16));
1900 base64bits -= 16;
1901 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1902 if (surrogate) {
1903 /* expecting a second surrogate */
1904 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1905#ifdef Py_UNICODE_WIDE
1906 *p++ = (((surrogate & 0x3FF)<<10)
1907 | (outCh & 0x3FF)) + 0x10000;
1908#else
1909 *p++ = surrogate;
1910 *p++ = outCh;
1911#endif
1912 surrogate = 0;
1913 }
1914 else {
1915 surrogate = 0;
1916 errmsg = "second surrogate missing";
1917 goto utf7Error;
1918 }
1919 }
1920 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1921 /* first surrogate */
1922 surrogate = outCh;
1923 }
1924 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1925 errmsg = "unexpected second surrogate";
1926 goto utf7Error;
1927 }
1928 else {
1929 *p++ = outCh;
1930 }
1931 }
1932 }
1933 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001934 inShift = 0;
1935 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001936 if (surrogate) {
1937 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001938 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001939 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001940 if (base64bits > 0) { /* left-over bits */
1941 if (base64bits >= 6) {
1942 /* We've seen at least one base-64 character */
1943 errmsg = "partial character in shift sequence";
1944 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001945 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001946 else {
1947 /* Some bits remain; they should be zero */
1948 if (base64buffer != 0) {
1949 errmsg = "non-zero padding bits in shift sequence";
1950 goto utf7Error;
1951 }
1952 }
1953 }
1954 if (ch != '-') {
1955 /* '-' is absorbed; other terminating
1956 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 *p++ = ch;
1958 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001959 }
1960 }
1961 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001963 s++; /* consume '+' */
1964 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001965 s++;
1966 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00001967 }
1968 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001969 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001970 shiftOutStart = p;
1971 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001972 }
1973 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001974 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001975 *p++ = ch;
1976 s++;
1977 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001978 else {
1979 startinpos = s-starts;
1980 s++;
1981 errmsg = "unexpected special character";
1982 goto utf7Error;
1983 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001984 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001985utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001986 outpos = p-PyUnicode_AS_UNICODE(unicode);
1987 endinpos = s-starts;
1988 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001989 errors, &errorHandler,
1990 "utf7", errmsg,
1991 &starts, &e, &startinpos, &endinpos, &exc, &s,
1992 &unicode, &outpos, &p))
1993 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001994 }
1995
Antoine Pitrou244651a2009-05-04 18:56:13 +00001996 /* end of string */
1997
1998 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1999 /* if we're in an inconsistent state, that's an error */
2000 if (surrogate ||
2001 (base64bits >= 6) ||
2002 (base64bits > 0 && base64buffer != 0)) {
2003 outpos = p-PyUnicode_AS_UNICODE(unicode);
2004 endinpos = size;
2005 if (unicode_decode_call_errorhandler(
2006 errors, &errorHandler,
2007 "utf7", "unterminated shift sequence",
2008 &starts, &e, &startinpos, &endinpos, &exc, &s,
2009 &unicode, &outpos, &p))
2010 goto onError;
2011 if (s < e)
2012 goto restart;
2013 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002014 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002015
2016 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002017 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002018 if (inShift) {
2019 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002020 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002021 }
2022 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002023 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002024 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002025 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002026
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002027 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002028 goto onError;
2029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 Py_XDECREF(errorHandler);
2031 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002032 return (PyObject *)unicode;
2033
Benjamin Peterson29060642009-01-31 22:14:21 +00002034 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 Py_XDECREF(errorHandler);
2036 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002037 Py_DECREF(unicode);
2038 return NULL;
2039}
2040
2041
2042PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002043 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002044 int base64SetO,
2045 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002046 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002047{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002048 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002049 /* It might be possible to tighten this worst case */
Georg Brandl194da4a2009-08-13 09:34:05 +00002050 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002051 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002052 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002053 unsigned int base64bits = 0;
2054 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002055 char * out;
2056 char * start;
2057
2058 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060
Georg Brandl194da4a2009-08-13 09:34:05 +00002061 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002062 return PyErr_NoMemory();
2063
Antoine Pitrou244651a2009-05-04 18:56:13 +00002064 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002065 if (v == NULL)
2066 return NULL;
2067
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002068 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002069 for (;i < size; ++i) {
2070 Py_UNICODE ch = s[i];
2071
Antoine Pitrou244651a2009-05-04 18:56:13 +00002072 if (inShift) {
2073 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2074 /* shifting out */
2075 if (base64bits) { /* output remaining bits */
2076 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2077 base64buffer = 0;
2078 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002079 }
2080 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002081 /* Characters not in the BASE64 set implicitly unshift the sequence
2082 so no '-' is required, except if the character is itself a '-' */
2083 if (IS_BASE64(ch) || ch == '-') {
2084 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002085 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002086 *out++ = (char) ch;
2087 }
2088 else {
2089 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002090 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002091 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002092 else { /* not in a shift sequence */
2093 if (ch == '+') {
2094 *out++ = '+';
2095 *out++ = '-';
2096 }
2097 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2098 *out++ = (char) ch;
2099 }
2100 else {
2101 *out++ = '+';
2102 inShift = 1;
2103 goto encode_char;
2104 }
2105 }
2106 continue;
2107encode_char:
2108#ifdef Py_UNICODE_WIDE
2109 if (ch >= 0x10000) {
2110 /* code first surrogate */
2111 base64bits += 16;
2112 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2113 while (base64bits >= 6) {
2114 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2115 base64bits -= 6;
2116 }
2117 /* prepare second surrogate */
2118 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2119 }
2120#endif
2121 base64bits += 16;
2122 base64buffer = (base64buffer << 16) | ch;
2123 while (base64bits >= 6) {
2124 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2125 base64bits -= 6;
2126 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002127 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002128 if (base64bits)
2129 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2130 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002131 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002132 if (_PyBytes_Resize(&v, out - start) < 0)
2133 return NULL;
2134 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002135}
2136
Antoine Pitrou244651a2009-05-04 18:56:13 +00002137#undef IS_BASE64
2138#undef FROM_BASE64
2139#undef TO_BASE64
2140#undef DECODE_DIRECT
2141#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002142
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143/* --- UTF-8 Codec -------------------------------------------------------- */
2144
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero means illegal prefix.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 70-7F */
    /* 80-BF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   /* F0-FF */
};
2166
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002168 Py_ssize_t size,
2169 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170{
Walter Dörwald69652032004-09-07 20:24:22 +00002171 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2172}
2173
Antoine Pitrouab868312009-01-10 15:40:25 +00002174/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2175#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2176
2177/* Mask to quickly check whether a C 'long' contains a
2178 non-ASCII, UTF8-encoded char. */
2179#if (SIZEOF_LONG == 8)
2180# define ASCII_CHAR_MASK 0x8080808080808080L
2181#elif (SIZEOF_LONG == 4)
2182# define ASCII_CHAR_MASK 0x80808080L
2183#else
2184# error C 'long' size should be either 4 or 8!
2185#endif
2186
Walter Dörwald69652032004-09-07 20:24:22 +00002187PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002188 Py_ssize_t size,
2189 const char *errors,
2190 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002191{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002192 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 int n;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002194 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002195 Py_ssize_t startinpos;
2196 Py_ssize_t endinpos;
2197 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002198 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 PyUnicodeObject *unicode;
2200 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002201 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 PyObject *errorHandler = NULL;
2203 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204
2205 /* Note: size will always be longer than the resulting Unicode
2206 character count */
2207 unicode = _PyUnicode_New(size);
2208 if (!unicode)
2209 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002210 if (size == 0) {
2211 if (consumed)
2212 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002214 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215
2216 /* Unpack UTF-8 encoded data */
2217 p = unicode->str;
2218 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002219 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220
2221 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002222 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223
2224 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002225 /* Fast path for runs of ASCII characters. Given that common UTF-8
2226 input will consist of an overwhelming majority of ASCII
2227 characters, we try to optimize for this case by checking
2228 as many characters as a C 'long' can contain.
2229 First, check if we can do an aligned read, as most CPUs have
2230 a penalty for unaligned reads.
2231 */
2232 if (!((size_t) s & LONG_PTR_MASK)) {
2233 /* Help register allocation */
2234 register const char *_s = s;
2235 register Py_UNICODE *_p = p;
2236 while (_s < aligned_end) {
2237 /* Read a whole long at a time (either 4 or 8 bytes),
2238 and do a fast unrolled copy if it only contains ASCII
2239 characters. */
2240 unsigned long data = *(unsigned long *) _s;
2241 if (data & ASCII_CHAR_MASK)
2242 break;
2243 _p[0] = (unsigned char) _s[0];
2244 _p[1] = (unsigned char) _s[1];
2245 _p[2] = (unsigned char) _s[2];
2246 _p[3] = (unsigned char) _s[3];
2247#if (SIZEOF_LONG == 8)
2248 _p[4] = (unsigned char) _s[4];
2249 _p[5] = (unsigned char) _s[5];
2250 _p[6] = (unsigned char) _s[6];
2251 _p[7] = (unsigned char) _s[7];
2252#endif
2253 _s += SIZEOF_LONG;
2254 _p += SIZEOF_LONG;
2255 }
2256 s = _s;
2257 p = _p;
2258 if (s == e)
2259 break;
2260 ch = (unsigned char)*s;
2261 }
2262 }
2263
2264 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002265 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 s++;
2267 continue;
2268 }
2269
2270 n = utf8_code_length[ch];
2271
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002272 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002273 if (consumed)
2274 break;
2275 else {
2276 errmsg = "unexpected end of data";
2277 startinpos = s-starts;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002278 endinpos = startinpos+1;
2279 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2280 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002281 goto utf8Error;
2282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284
2285 switch (n) {
2286
2287 case 0:
Ezio Melotti25bc0192010-07-03 05:18:50 +00002288 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 startinpos = s-starts;
2290 endinpos = startinpos+1;
2291 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002294 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002295 startinpos = s-starts;
2296 endinpos = startinpos+1;
2297 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298
2299 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002300 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti25bc0192010-07-03 05:18:50 +00002301 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 startinpos = s-starts;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002303 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002304 goto utf8Error;
2305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti25bc0192010-07-03 05:18:50 +00002307 assert ((ch > 0x007F) && (ch <= 0x07FF));
2308 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002309 break;
2310
2311 case 3:
Ezio Melotti25bc0192010-07-03 05:18:50 +00002312 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2313 will result in surrogates in range d800-dfff. Surrogates are
2314 not valid UTF-8 so they are rejected.
2315 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2316 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002317 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti25bc0192010-07-03 05:18:50 +00002318 (s[2] & 0xc0) != 0x80 ||
2319 ((unsigned char)s[0] == 0xE0 &&
2320 (unsigned char)s[1] < 0xA0) ||
2321 ((unsigned char)s[0] == 0xED &&
2322 (unsigned char)s[1] > 0x9F)) {
2323 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002324 startinpos = s-starts;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002325 endinpos = startinpos + 1;
2326
2327 /* if s[1] first two bits are 1 and 0, then the invalid
2328 continuation byte is s[2], so increment endinpos by 1,
2329 if not, s[1] is invalid and endinpos doesn't need to
2330 be incremented. */
2331 if ((s[1] & 0xC0) == 0x80)
2332 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002333 goto utf8Error;
2334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti25bc0192010-07-03 05:18:50 +00002336 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2337 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002338 break;
2339
2340 case 4:
2341 if ((s[1] & 0xc0) != 0x80 ||
2342 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti25bc0192010-07-03 05:18:50 +00002343 (s[3] & 0xc0) != 0x80 ||
2344 ((unsigned char)s[0] == 0xF0 &&
2345 (unsigned char)s[1] < 0x90) ||
2346 ((unsigned char)s[0] == 0xF4 &&
2347 (unsigned char)s[1] > 0x8F)) {
2348 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002349 startinpos = s-starts;
Ezio Melotti25bc0192010-07-03 05:18:50 +00002350 endinpos = startinpos + 1;
2351 if ((s[1] & 0xC0) == 0x80) {
2352 endinpos++;
2353 if ((s[2] & 0xC0) == 0x80)
2354 endinpos++;
2355 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002356 goto utf8Error;
2357 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002358 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti25bc0192010-07-03 05:18:50 +00002359 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2360 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2361
Fredrik Lundh8f455852001-06-27 18:59:43 +00002362#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002363 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002364#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002365 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002366
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002367 /* translate from 10000..10FFFF to 0..FFFF */
2368 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002369
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002370 /* high surrogate = top 10 bits added to D800 */
2371 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002372
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002373 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002374 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002375#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 }
2378 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002379 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002380
Benjamin Peterson29060642009-01-31 22:14:21 +00002381 utf8Error:
2382 outpos = p-PyUnicode_AS_UNICODE(unicode);
2383 if (unicode_decode_call_errorhandler(
2384 errors, &errorHandler,
2385 "utf8", errmsg,
2386 &starts, &e, &startinpos, &endinpos, &exc, &s,
2387 &unicode, &outpos, &p))
2388 goto onError;
2389 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390 }
Walter Dörwald69652032004-09-07 20:24:22 +00002391 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002392 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393
2394 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002395 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396 goto onError;
2397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 Py_XDECREF(errorHandler);
2399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400 return (PyObject *)unicode;
2401
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002403 Py_XDECREF(errorHandler);
2404 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002405 Py_DECREF(unicode);
2406 return NULL;
2407}
2408
Antoine Pitrouab868312009-01-10 15:40:25 +00002409#undef ASCII_CHAR_MASK
2410
2411
Tim Peters602f7402002-04-27 18:03:26 +00002412/* Allocation strategy: if the string is short, convert into a stack buffer
2413 and allocate exactly as much space needed at the end. Else allocate the
2414 maximum possible needed (4 result bytes per Unicode character), and return
2415 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002416*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002417PyObject *
2418PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 Py_ssize_t size,
2420 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421{
Tim Peters602f7402002-04-27 18:03:26 +00002422#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002423
Guido van Rossum98297ee2007-11-06 21:34:58 +00002424 Py_ssize_t i; /* index into s of next input byte */
2425 PyObject *result; /* result string object */
2426 char *p; /* next free byte in output buffer */
2427 Py_ssize_t nallocated; /* number of result bytes allocated */
2428 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002429 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002430 PyObject *errorHandler = NULL;
2431 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002432
Tim Peters602f7402002-04-27 18:03:26 +00002433 assert(s != NULL);
2434 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435
Tim Peters602f7402002-04-27 18:03:26 +00002436 if (size <= MAX_SHORT_UNICHARS) {
2437 /* Write into the stack buffer; nallocated can't overflow.
2438 * At the end, we'll allocate exactly as much heap space as it
2439 * turns out we need.
2440 */
2441 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002442 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002443 p = stackbuf;
2444 }
2445 else {
2446 /* Overallocate on the heap, and give the excess back at the end. */
2447 nallocated = size * 4;
2448 if (nallocated / 4 != size) /* overflow! */
2449 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002450 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002451 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002452 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002453 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002454 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002455
Tim Peters602f7402002-04-27 18:03:26 +00002456 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002457 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002458
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002459 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002460 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002462
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002464 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002465 *p++ = (char)(0xc0 | (ch >> 6));
2466 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner158701d2010-04-22 19:41:01 +00002467 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002468#ifndef Py_UNICODE_WIDE
Victor Stinner158701d2010-04-22 19:41:01 +00002469 /* Special case: check for high and low surrogate */
2470 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2471 Py_UCS4 ch2 = s[i];
2472 /* Combine the two surrogates to form a UCS4 value */
2473 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2474 i++;
2475
2476 /* Encode UCS4 Unicode ordinals */
2477 *p++ = (char)(0xf0 | (ch >> 18));
2478 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2480 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner158701d2010-04-22 19:41:01 +00002481 } else {
Victor Stinner0b79b762010-04-22 20:07:28 +00002482#endif
Victor Stinner158701d2010-04-22 19:41:01 +00002483 Py_ssize_t newpos;
2484 PyObject *rep;
2485 Py_ssize_t repsize, k;
2486 rep = unicode_encode_call_errorhandler
2487 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2488 s, size, &exc, i-1, i, &newpos);
2489 if (!rep)
2490 goto error;
2491
2492 if (PyBytes_Check(rep))
2493 repsize = PyBytes_GET_SIZE(rep);
2494 else
2495 repsize = PyUnicode_GET_SIZE(rep);
2496
2497 if (repsize > 4) {
2498 Py_ssize_t offset;
2499
2500 if (result == NULL)
2501 offset = p - stackbuf;
2502 else
2503 offset = p - PyBytes_AS_STRING(result);
2504
2505 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2506 /* integer overflow */
2507 PyErr_NoMemory();
2508 goto error;
2509 }
2510 nallocated += repsize - 4;
2511 if (result != NULL) {
2512 if (_PyBytes_Resize(&result, nallocated) < 0)
2513 goto error;
2514 } else {
2515 result = PyBytes_FromStringAndSize(NULL, nallocated);
2516 if (result == NULL)
2517 goto error;
2518 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2519 }
2520 p = PyBytes_AS_STRING(result) + offset;
2521 }
2522
2523 if (PyBytes_Check(rep)) {
2524 char *prep = PyBytes_AS_STRING(rep);
2525 for(k = repsize; k > 0; k--)
2526 *p++ = *prep++;
2527 } else /* rep is unicode */ {
2528 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2529 Py_UNICODE c;
2530
2531 for(k=0; k<repsize; k++) {
2532 c = prep[k];
2533 if (0x80 <= c) {
2534 raise_encode_exception(&exc, "utf-8", s, size,
2535 i-1, i, "surrogates not allowed");
2536 goto error;
2537 }
2538 *p++ = (char)prep[k];
2539 }
2540 }
2541 Py_DECREF(rep);
Victor Stinner0b79b762010-04-22 20:07:28 +00002542#ifndef Py_UNICODE_WIDE
Victor Stinner158701d2010-04-22 19:41:01 +00002543 }
Victor Stinner0b79b762010-04-22 20:07:28 +00002544#endif
Victor Stinner158701d2010-04-22 19:41:01 +00002545 } else if (ch < 0x10000) {
2546 *p++ = (char)(0xe0 | (ch >> 12));
2547 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2548 *p++ = (char)(0x80 | (ch & 0x3f));
2549 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002550 /* Encode UCS4 Unicode ordinals */
2551 *p++ = (char)(0xf0 | (ch >> 18));
2552 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2553 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2554 *p++ = (char)(0x80 | (ch & 0x3f));
2555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002557
Guido van Rossum98297ee2007-11-06 21:34:58 +00002558 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002559 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002560 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002561 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002562 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002563 }
2564 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002565 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002566 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002567 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002568 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002569 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002570 Py_XDECREF(errorHandler);
2571 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002572 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002573 error:
2574 Py_XDECREF(errorHandler);
2575 Py_XDECREF(exc);
2576 Py_XDECREF(result);
2577 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002578
Tim Peters602f7402002-04-27 18:03:26 +00002579#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580}
2581
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2583{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 if (!PyUnicode_Check(unicode)) {
2585 PyErr_BadArgument();
2586 return NULL;
2587 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002588 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002589 PyUnicode_GET_SIZE(unicode),
2590 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591}
2592
Walter Dörwald41980ca2007-08-16 21:55:45 +00002593/* --- UTF-32 Codec ------------------------------------------------------- */
2594
2595PyObject *
2596PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002597 Py_ssize_t size,
2598 const char *errors,
2599 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002600{
2601 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2602}
2603
2604PyObject *
2605PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002606 Py_ssize_t size,
2607 const char *errors,
2608 int *byteorder,
2609 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002610{
2611 const char *starts = s;
2612 Py_ssize_t startinpos;
2613 Py_ssize_t endinpos;
2614 Py_ssize_t outpos;
2615 PyUnicodeObject *unicode;
2616 Py_UNICODE *p;
2617#ifndef Py_UNICODE_WIDE
Antoine Pitrou6107a682010-06-11 21:48:34 +00002618 int pairs = 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002619#else
2620 const int pairs = 0;
2621#endif
Antoine Pitrou6107a682010-06-11 21:48:34 +00002622 const unsigned char *q, *e, *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002623 int bo = 0; /* assume native ordering by default */
2624 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002625 /* Offsets from q for retrieving bytes in the right order. */
2626#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2627 int iorder[] = {0, 1, 2, 3};
2628#else
2629 int iorder[] = {3, 2, 1, 0};
2630#endif
2631 PyObject *errorHandler = NULL;
2632 PyObject *exc = NULL;
Antoine Pitrou6107a682010-06-11 21:48:34 +00002633
Walter Dörwald41980ca2007-08-16 21:55:45 +00002634 q = (unsigned char *)s;
2635 e = q + size;
2636
2637 if (byteorder)
2638 bo = *byteorder;
2639
2640 /* Check for BOM marks (U+FEFF) in the input and adjust current
2641 byte order setting accordingly. In native mode, the leading BOM
2642 mark is skipped, in all other modes, it is copied to the output
2643 stream as-is (giving a ZWNBSP character). */
2644 if (bo == 0) {
2645 if (size >= 4) {
2646 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002647 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002648#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002649 if (bom == 0x0000FEFF) {
2650 q += 4;
2651 bo = -1;
2652 }
2653 else if (bom == 0xFFFE0000) {
2654 q += 4;
2655 bo = 1;
2656 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002657#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002658 if (bom == 0x0000FEFF) {
2659 q += 4;
2660 bo = 1;
2661 }
2662 else if (bom == 0xFFFE0000) {
2663 q += 4;
2664 bo = -1;
2665 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002666#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002667 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002668 }
2669
2670 if (bo == -1) {
2671 /* force LE */
2672 iorder[0] = 0;
2673 iorder[1] = 1;
2674 iorder[2] = 2;
2675 iorder[3] = 3;
2676 }
2677 else if (bo == 1) {
2678 /* force BE */
2679 iorder[0] = 3;
2680 iorder[1] = 2;
2681 iorder[2] = 1;
2682 iorder[3] = 0;
2683 }
2684
Antoine Pitrou6107a682010-06-11 21:48:34 +00002685 /* On narrow builds we split characters outside the BMP into two
2686 codepoints => count how much extra space we need. */
2687#ifndef Py_UNICODE_WIDE
2688 for (qq = q; qq < e; qq += 4)
2689 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2690 pairs++;
2691#endif
2692
2693 /* This might be one to much, because of a BOM */
2694 unicode = _PyUnicode_New((size+3)/4+pairs);
2695 if (!unicode)
2696 return NULL;
2697 if (size == 0)
2698 return (PyObject *)unicode;
2699
2700 /* Unpack UTF-32 encoded data */
2701 p = unicode->str;
2702
Walter Dörwald41980ca2007-08-16 21:55:45 +00002703 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002704 Py_UCS4 ch;
2705 /* remaining bytes at the end? (size should be divisible by 4) */
2706 if (e-q<4) {
2707 if (consumed)
2708 break;
2709 errmsg = "truncated data";
2710 startinpos = ((const char *)q)-starts;
2711 endinpos = ((const char *)e)-starts;
2712 goto utf32Error;
2713 /* The remaining input chars are ignored if the callback
2714 chooses to skip the input */
2715 }
2716 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2717 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002718
Benjamin Peterson29060642009-01-31 22:14:21 +00002719 if (ch >= 0x110000)
2720 {
2721 errmsg = "codepoint not in range(0x110000)";
2722 startinpos = ((const char *)q)-starts;
2723 endinpos = startinpos+4;
2724 goto utf32Error;
2725 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002726#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002727 if (ch >= 0x10000)
2728 {
2729 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2730 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2731 }
2732 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002733#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002734 *p++ = ch;
2735 q += 4;
2736 continue;
2737 utf32Error:
2738 outpos = p-PyUnicode_AS_UNICODE(unicode);
2739 if (unicode_decode_call_errorhandler(
2740 errors, &errorHandler,
2741 "utf32", errmsg,
2742 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2743 &unicode, &outpos, &p))
2744 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002745 }
2746
2747 if (byteorder)
2748 *byteorder = bo;
2749
2750 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002751 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002752
2753 /* Adjust length */
2754 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2755 goto onError;
2756
2757 Py_XDECREF(errorHandler);
2758 Py_XDECREF(exc);
2759 return (PyObject *)unicode;
2760
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002762 Py_DECREF(unicode);
2763 Py_XDECREF(errorHandler);
2764 Py_XDECREF(exc);
2765 return NULL;
2766}
2767
2768PyObject *
2769PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 Py_ssize_t size,
2771 const char *errors,
2772 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002773{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002774 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002775 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002776 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002777#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002778 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002779#else
2780 const int pairs = 0;
2781#endif
2782 /* Offsets from p for storing byte pairs in the right order. */
2783#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2784 int iorder[] = {0, 1, 2, 3};
2785#else
2786 int iorder[] = {3, 2, 1, 0};
2787#endif
2788
Benjamin Peterson29060642009-01-31 22:14:21 +00002789#define STORECHAR(CH) \
2790 do { \
2791 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2792 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2793 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2794 p[iorder[0]] = (CH) & 0xff; \
2795 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002796 } while(0)
2797
2798 /* In narrow builds we can output surrogate pairs as one codepoint,
2799 so we need less space. */
2800#ifndef Py_UNICODE_WIDE
2801 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002802 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2803 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2804 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002805#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002806 nsize = (size - pairs + (byteorder == 0));
2807 bytesize = nsize * 4;
2808 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002810 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002811 if (v == NULL)
2812 return NULL;
2813
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002814 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002815 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002816 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002817 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002818 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002819
2820 if (byteorder == -1) {
2821 /* force LE */
2822 iorder[0] = 0;
2823 iorder[1] = 1;
2824 iorder[2] = 2;
2825 iorder[3] = 3;
2826 }
2827 else if (byteorder == 1) {
2828 /* force BE */
2829 iorder[0] = 3;
2830 iorder[1] = 2;
2831 iorder[2] = 1;
2832 iorder[3] = 0;
2833 }
2834
2835 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002836 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002837#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2839 Py_UCS4 ch2 = *s;
2840 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2841 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2842 s++;
2843 size--;
2844 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002845 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002846#endif
2847 STORECHAR(ch);
2848 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002849
2850 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002851 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002852#undef STORECHAR
2853}
2854
2855PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2856{
2857 if (!PyUnicode_Check(unicode)) {
2858 PyErr_BadArgument();
2859 return NULL;
2860 }
2861 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002862 PyUnicode_GET_SIZE(unicode),
2863 NULL,
2864 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002865}
2866
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867/* --- UTF-16 Codec ------------------------------------------------------- */
2868
Tim Peters772747b2001-08-09 22:21:55 +00002869PyObject *
2870PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002871 Py_ssize_t size,
2872 const char *errors,
2873 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874{
Walter Dörwald69652032004-09-07 20:24:22 +00002875 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2876}
2877
Antoine Pitrouab868312009-01-10 15:40:25 +00002878/* Two masks for fast checking of whether a C 'long' may contain
2879 UTF16-encoded surrogate characters. This is an efficient heuristic,
2880 assuming that non-surrogate characters with a code point >= 0x8000 are
2881 rare in most input.
2882 FAST_CHAR_MASK is used when the input is in native byte ordering,
2883 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002884*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002885#if (SIZEOF_LONG == 8)
2886# define FAST_CHAR_MASK 0x8000800080008000L
2887# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2888#elif (SIZEOF_LONG == 4)
2889# define FAST_CHAR_MASK 0x80008000L
2890# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2891#else
2892# error C 'long' size should be either 4 or 8!
2893#endif
2894
Walter Dörwald69652032004-09-07 20:24:22 +00002895PyObject *
2896PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 Py_ssize_t size,
2898 const char *errors,
2899 int *byteorder,
2900 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002901{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002902 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002903 Py_ssize_t startinpos;
2904 Py_ssize_t endinpos;
2905 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 PyUnicodeObject *unicode;
2907 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002908 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002909 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002910 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002911 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002912 /* Offsets from q for retrieving byte pairs in the right order. */
2913#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2914 int ihi = 1, ilo = 0;
2915#else
2916 int ihi = 0, ilo = 1;
2917#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 PyObject *errorHandler = NULL;
2919 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920
2921 /* Note: size will always be longer than the resulting Unicode
2922 character count */
2923 unicode = _PyUnicode_New(size);
2924 if (!unicode)
2925 return NULL;
2926 if (size == 0)
2927 return (PyObject *)unicode;
2928
2929 /* Unpack UTF-16 encoded data */
2930 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002931 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002932 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933
2934 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002935 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002937 /* Check for BOM marks (U+FEFF) in the input and adjust current
2938 byte order setting accordingly. In native mode, the leading BOM
2939 mark is skipped, in all other modes, it is copied to the output
2940 stream as-is (giving a ZWNBSP character). */
2941 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002942 if (size >= 2) {
2943 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002944#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002945 if (bom == 0xFEFF) {
2946 q += 2;
2947 bo = -1;
2948 }
2949 else if (bom == 0xFFFE) {
2950 q += 2;
2951 bo = 1;
2952 }
Tim Petersced69f82003-09-16 20:30:58 +00002953#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002954 if (bom == 0xFEFF) {
2955 q += 2;
2956 bo = 1;
2957 }
2958 else if (bom == 0xFFFE) {
2959 q += 2;
2960 bo = -1;
2961 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002962#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002963 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965
Tim Peters772747b2001-08-09 22:21:55 +00002966 if (bo == -1) {
2967 /* force LE */
2968 ihi = 1;
2969 ilo = 0;
2970 }
2971 else if (bo == 1) {
2972 /* force BE */
2973 ihi = 0;
2974 ilo = 1;
2975 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002976#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2977 native_ordering = ilo < ihi;
2978#else
2979 native_ordering = ilo > ihi;
2980#endif
Tim Peters772747b2001-08-09 22:21:55 +00002981
Antoine Pitrouab868312009-01-10 15:40:25 +00002982 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002983 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002985 /* First check for possible aligned read of a C 'long'. Unaligned
2986 reads are more expensive, better to defer to another iteration. */
2987 if (!((size_t) q & LONG_PTR_MASK)) {
2988 /* Fast path for runs of non-surrogate chars. */
2989 register const unsigned char *_q = q;
2990 Py_UNICODE *_p = p;
2991 if (native_ordering) {
2992 /* Native ordering is simple: as long as the input cannot
2993 possibly contain a surrogate char, do an unrolled copy
2994 of several 16-bit code points to the target object.
2995 The non-surrogate check is done on several input bytes
2996 at a time (as many as a C 'long' can contain). */
2997 while (_q < aligned_end) {
2998 unsigned long data = * (unsigned long *) _q;
2999 if (data & FAST_CHAR_MASK)
3000 break;
3001 _p[0] = ((unsigned short *) _q)[0];
3002 _p[1] = ((unsigned short *) _q)[1];
3003#if (SIZEOF_LONG == 8)
3004 _p[2] = ((unsigned short *) _q)[2];
3005 _p[3] = ((unsigned short *) _q)[3];
3006#endif
3007 _q += SIZEOF_LONG;
3008 _p += SIZEOF_LONG / 2;
3009 }
3010 }
3011 else {
3012 /* Byteswapped ordering is similar, but we must decompose
3013 the copy bytewise, and take care of zero'ing out the
3014 upper bytes if the target object is in 32-bit units
3015 (that is, in UCS-4 builds). */
3016 while (_q < aligned_end) {
3017 unsigned long data = * (unsigned long *) _q;
3018 if (data & SWAPPED_FAST_CHAR_MASK)
3019 break;
3020 /* Zero upper bytes in UCS-4 builds */
3021#if (Py_UNICODE_SIZE > 2)
3022 _p[0] = 0;
3023 _p[1] = 0;
3024#if (SIZEOF_LONG == 8)
3025 _p[2] = 0;
3026 _p[3] = 0;
3027#endif
3028#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003029 /* Issue #4916; UCS-4 builds on big endian machines must
3030 fill the two last bytes of each 4-byte unit. */
3031#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3032# define OFF 2
3033#else
3034# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003035#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003036 ((unsigned char *) _p)[OFF + 1] = _q[0];
3037 ((unsigned char *) _p)[OFF + 0] = _q[1];
3038 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3039 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3040#if (SIZEOF_LONG == 8)
3041 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3042 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3043 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3044 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3045#endif
3046#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003047 _q += SIZEOF_LONG;
3048 _p += SIZEOF_LONG / 2;
3049 }
3050 }
3051 p = _p;
3052 q = _q;
3053 if (q >= e)
3054 break;
3055 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003057
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003059
3060 if (ch < 0xD800 || ch > 0xDFFF) {
3061 *p++ = ch;
3062 continue;
3063 }
3064
3065 /* UTF-16 code pair: */
3066 if (q > e) {
3067 errmsg = "unexpected end of data";
3068 startinpos = (((const char *)q) - 2) - starts;
3069 endinpos = ((const char *)e) + 1 - starts;
3070 goto utf16Error;
3071 }
3072 if (0xD800 <= ch && ch <= 0xDBFF) {
3073 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3074 q += 2;
3075 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003076#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 *p++ = ch;
3078 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003079#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003081#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 continue;
3083 }
3084 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003085 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 startinpos = (((const char *)q)-4)-starts;
3087 endinpos = startinpos+2;
3088 goto utf16Error;
3089 }
3090
Benjamin Peterson14339b62009-01-31 16:36:08 +00003091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003092 errmsg = "illegal encoding";
3093 startinpos = (((const char *)q)-2)-starts;
3094 endinpos = startinpos+2;
3095 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003096
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 utf16Error:
3098 outpos = p - PyUnicode_AS_UNICODE(unicode);
3099 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003100 errors,
3101 &errorHandler,
3102 "utf16", errmsg,
3103 &starts,
3104 (const char **)&e,
3105 &startinpos,
3106 &endinpos,
3107 &exc,
3108 (const char **)&q,
3109 &unicode,
3110 &outpos,
3111 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003114 /* remaining byte at the end? (size should be even) */
3115 if (e == q) {
3116 if (!consumed) {
3117 errmsg = "truncated data";
3118 startinpos = ((const char *)q) - starts;
3119 endinpos = ((const char *)e) + 1 - starts;
3120 outpos = p - PyUnicode_AS_UNICODE(unicode);
3121 if (unicode_decode_call_errorhandler(
3122 errors,
3123 &errorHandler,
3124 "utf16", errmsg,
3125 &starts,
3126 (const char **)&e,
3127 &startinpos,
3128 &endinpos,
3129 &exc,
3130 (const char **)&q,
3131 &unicode,
3132 &outpos,
3133 &p))
3134 goto onError;
3135 /* The remaining input chars are ignored if the callback
3136 chooses to skip the input */
3137 }
3138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139
3140 if (byteorder)
3141 *byteorder = bo;
3142
Walter Dörwald69652032004-09-07 20:24:22 +00003143 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003145
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003147 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 goto onError;
3149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003150 Py_XDECREF(errorHandler);
3151 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 return (PyObject *)unicode;
3153
Benjamin Peterson29060642009-01-31 22:14:21 +00003154 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003156 Py_XDECREF(errorHandler);
3157 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 return NULL;
3159}
3160
Antoine Pitrouab868312009-01-10 15:40:25 +00003161#undef FAST_CHAR_MASK
3162#undef SWAPPED_FAST_CHAR_MASK
3163
Tim Peters772747b2001-08-09 22:21:55 +00003164PyObject *
3165PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003166 Py_ssize_t size,
3167 const char *errors,
3168 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003170 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003171 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003172 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003173#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003174 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003175#else
3176 const int pairs = 0;
3177#endif
Tim Peters772747b2001-08-09 22:21:55 +00003178 /* Offsets from p for storing byte pairs in the right order. */
3179#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3180 int ihi = 1, ilo = 0;
3181#else
3182 int ihi = 0, ilo = 1;
3183#endif
3184
Benjamin Peterson29060642009-01-31 22:14:21 +00003185#define STORECHAR(CH) \
3186 do { \
3187 p[ihi] = ((CH) >> 8) & 0xff; \
3188 p[ilo] = (CH) & 0xff; \
3189 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003190 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003192#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003193 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003194 if (s[i] >= 0x10000)
3195 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003196#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003197 /* 2 * (size + pairs + (byteorder == 0)) */
3198 if (size > PY_SSIZE_T_MAX ||
3199 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003201 nsize = size + pairs + (byteorder == 0);
3202 bytesize = nsize * 2;
3203 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003204 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003205 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 if (v == NULL)
3207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003209 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003211 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003212 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003213 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003214
3215 if (byteorder == -1) {
3216 /* force LE */
3217 ihi = 1;
3218 ilo = 0;
3219 }
3220 else if (byteorder == 1) {
3221 /* force BE */
3222 ihi = 0;
3223 ilo = 1;
3224 }
3225
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003226 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 Py_UNICODE ch = *s++;
3228 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003229#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 if (ch >= 0x10000) {
3231 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3232 ch = 0xD800 | ((ch-0x10000) >> 10);
3233 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003234#endif
Tim Peters772747b2001-08-09 22:21:55 +00003235 STORECHAR(ch);
3236 if (ch2)
3237 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003238 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003239
3240 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003241 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003242#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243}
3244
3245PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3246{
3247 if (!PyUnicode_Check(unicode)) {
3248 PyErr_BadArgument();
3249 return NULL;
3250 }
3251 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003252 PyUnicode_GET_SIZE(unicode),
3253 NULL,
3254 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255}
3256
3257/* --- Unicode Escape Codec ----------------------------------------------- */
3258
Fredrik Lundh06d12682001-01-24 07:59:11 +00003259static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003260
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 Py_ssize_t size,
3263 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003266 Py_ssize_t startinpos;
3267 Py_ssize_t endinpos;
3268 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003273 char* message;
3274 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 PyObject *errorHandler = NULL;
3276 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003277
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278 /* Escaped strings will always be longer than the resulting
3279 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 length after conversion to the true value.
3281 (but if the error callback returns a long replacement string
3282 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283 v = _PyUnicode_New(size);
3284 if (v == NULL)
3285 goto onError;
3286 if (size == 0)
3287 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003288
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003291
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 while (s < end) {
3293 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003294 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296
3297 /* Non-escape characters are interpreted as Unicode ordinals */
3298 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003299 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 continue;
3301 }
3302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003303 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 /* \ - Escapes */
3305 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003306 c = *s++;
3307 if (s > end)
3308 c = '\0'; /* Invalid after \ */
3309 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 case '\n': break;
3313 case '\\': *p++ = '\\'; break;
3314 case '\'': *p++ = '\''; break;
3315 case '\"': *p++ = '\"'; break;
3316 case 'b': *p++ = '\b'; break;
3317 case 'f': *p++ = '\014'; break; /* FF */
3318 case 't': *p++ = '\t'; break;
3319 case 'n': *p++ = '\n'; break;
3320 case 'r': *p++ = '\r'; break;
3321 case 'v': *p++ = '\013'; break; /* VT */
3322 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3323
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 case '0': case '1': case '2': case '3':
3326 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003327 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003328 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003329 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003330 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003331 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003333 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 break;
3335
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 /* hex escapes */
3337 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003339 digits = 2;
3340 message = "truncated \\xXX escape";
3341 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342
Benjamin Peterson29060642009-01-31 22:14:21 +00003343 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003345 digits = 4;
3346 message = "truncated \\uXXXX escape";
3347 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348
Benjamin Peterson29060642009-01-31 22:14:21 +00003349 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003350 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003351 digits = 8;
3352 message = "truncated \\UXXXXXXXX escape";
3353 hexescape:
3354 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355 outpos = p-PyUnicode_AS_UNICODE(v);
3356 if (s+digits>end) {
3357 endinpos = size;
3358 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003359 errors, &errorHandler,
3360 "unicodeescape", "end of string in escape sequence",
3361 &starts, &end, &startinpos, &endinpos, &exc, &s,
3362 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363 goto onError;
3364 goto nextByte;
3365 }
3366 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003367 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003368 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369 endinpos = (s+i+1)-starts;
3370 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 errors, &errorHandler,
3372 "unicodeescape", message,
3373 &starts, &end, &startinpos, &endinpos, &exc, &s,
3374 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003375 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003376 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003377 }
3378 chr = (chr<<4) & ~0xF;
3379 if (c >= '0' && c <= '9')
3380 chr += c - '0';
3381 else if (c >= 'a' && c <= 'f')
3382 chr += 10 + c - 'a';
3383 else
3384 chr += 10 + c - 'A';
3385 }
3386 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003387 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003388 /* _decoding_error will have already written into the
3389 target buffer. */
3390 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003391 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003392 /* when we get here, chr is a 32-bit unicode character */
3393 if (chr <= 0xffff)
3394 /* UCS-2 character */
3395 *p++ = (Py_UNICODE) chr;
3396 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003397 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003398 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003399#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003400 *p++ = chr;
3401#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003402 chr -= 0x10000L;
3403 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003404 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003405#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003406 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 endinpos = s-starts;
3408 outpos = p-PyUnicode_AS_UNICODE(v);
3409 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 errors, &errorHandler,
3411 "unicodeescape", "illegal Unicode character",
3412 &starts, &end, &startinpos, &endinpos, &exc, &s,
3413 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003414 goto onError;
3415 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003416 break;
3417
Benjamin Peterson29060642009-01-31 22:14:21 +00003418 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003419 case 'N':
3420 message = "malformed \\N character escape";
3421 if (ucnhash_CAPI == NULL) {
3422 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003423 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003424 if (ucnhash_CAPI == NULL)
3425 goto ucnhashError;
3426 }
3427 if (*s == '{') {
3428 const char *start = s+1;
3429 /* look for the closing brace */
3430 while (*s != '}' && s < end)
3431 s++;
3432 if (s > start && s < end && *s == '}') {
3433 /* found a name. look it up in the unicode database */
3434 message = "unknown Unicode character name";
3435 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003436 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003437 goto store;
3438 }
3439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 endinpos = s-starts;
3441 outpos = p-PyUnicode_AS_UNICODE(v);
3442 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 errors, &errorHandler,
3444 "unicodeescape", message,
3445 &starts, &end, &startinpos, &endinpos, &exc, &s,
3446 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003447 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003448 break;
3449
3450 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003451 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 message = "\\ at end of string";
3453 s--;
3454 endinpos = s-starts;
3455 outpos = p-PyUnicode_AS_UNICODE(v);
3456 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003457 errors, &errorHandler,
3458 "unicodeescape", message,
3459 &starts, &end, &startinpos, &endinpos, &exc, &s,
3460 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003461 goto onError;
3462 }
3463 else {
3464 *p++ = '\\';
3465 *p++ = (unsigned char)s[-1];
3466 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003467 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003469 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003472 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003474 Py_XDECREF(errorHandler);
3475 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003477
Benjamin Peterson29060642009-01-31 22:14:21 +00003478 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003479 PyErr_SetString(
3480 PyExc_UnicodeError,
3481 "\\N escapes not supported (can't load unicodedata module)"
3482 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003483 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 Py_XDECREF(errorHandler);
3485 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003486 return NULL;
3487
Benjamin Peterson29060642009-01-31 22:14:21 +00003488 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 Py_XDECREF(errorHandler);
3491 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 return NULL;
3493}
3494
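/* Unicode-Escape encoding.

   PyUnicode_EncodeUnicodeEscape() below returns the Unicode-Escape encoded
   version of a Py_UNICODE buffer as a bytes object.

   Note: this comment used to describe a "quotes" flag that enclosed the
   result in u"" or u'' quotes; the encoder below takes no such parameter
   and emits no enclosing quotes.

*/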
3501
Thomas Wouters477c8d52006-05-27 19:21:47 +00003502Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 Py_ssize_t size,
3504 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003505{
3506 /* like wcschr, but doesn't stop at NULL characters */
3507
3508 while (size-- > 0) {
3509 if (*s == ch)
3510 return s;
3511 s++;
3512 }
3513
3514 return NULL;
3515}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003516
Walter Dörwald79e913e2007-05-12 11:08:06 +00003517static const char *hexdigits = "0123456789abcdef";
3518
3519PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003520 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003522 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003525#ifdef Py_UNICODE_WIDE
3526 const Py_ssize_t expandsize = 10;
3527#else
3528 const Py_ssize_t expandsize = 6;
3529#endif
3530
Thomas Wouters89f507f2006-12-13 04:49:30 +00003531 /* XXX(nnorwitz): rather than over-allocating, it would be
3532 better to choose a different scheme. Perhaps scan the
3533 first N-chars of the string and allocate based on that size.
3534 */
3535 /* Initial allocation is based on the longest-possible unichr
3536 escape.
3537
3538 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3539 unichr, so in this case it's the longest unichr escape. In
3540 narrow (UTF-16) builds this is five chars per source unichr
3541 since there are two unichrs in the surrogate pair, so in narrow
3542 (UTF-16) builds it's not the longest unichr escape.
3543
3544 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3545 so in the narrow (UTF-16) build case it's the longest unichr
3546 escape.
3547 */
3548
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003549 if (size == 0)
3550 return PyBytes_FromStringAndSize(NULL, 0);
3551
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003552 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003554
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003555 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003556 2
3557 + expandsize*size
3558 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 if (repr == NULL)
3560 return NULL;
3561
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003562 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 while (size-- > 0) {
3565 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003566
Walter Dörwald79e913e2007-05-12 11:08:06 +00003567 /* Escape backslashes */
3568 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 *p++ = '\\';
3570 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003571 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003572 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003573
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003574#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003575 /* Map 21-bit characters to '\U00xxxxxx' */
3576 else if (ch >= 0x10000) {
3577 *p++ = '\\';
3578 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003579 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3580 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3581 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3582 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3583 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3584 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3585 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3586 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003588 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003589#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3591 else if (ch >= 0xD800 && ch < 0xDC00) {
3592 Py_UNICODE ch2;
3593 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003594
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 ch2 = *s++;
3596 size--;
Georg Brandl0f147092010-08-01 20:54:22 +00003597 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3599 *p++ = '\\';
3600 *p++ = 'U';
3601 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3602 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3603 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3604 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3605 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3606 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3607 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3608 *p++ = hexdigits[ucs & 0x0000000F];
3609 continue;
3610 }
3611 /* Fall through: isolated surrogates are copied as-is */
3612 s--;
3613 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003614 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003615#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003616
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003618 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 *p++ = '\\';
3620 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003621 *p++ = hexdigits[(ch >> 12) & 0x000F];
3622 *p++ = hexdigits[(ch >> 8) & 0x000F];
3623 *p++ = hexdigits[(ch >> 4) & 0x000F];
3624 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003626
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003627 /* Map special whitespace to '\t', \n', '\r' */
3628 else if (ch == '\t') {
3629 *p++ = '\\';
3630 *p++ = 't';
3631 }
3632 else if (ch == '\n') {
3633 *p++ = '\\';
3634 *p++ = 'n';
3635 }
3636 else if (ch == '\r') {
3637 *p++ = '\\';
3638 *p++ = 'r';
3639 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003640
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003641 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003642 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003644 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003645 *p++ = hexdigits[(ch >> 4) & 0x000F];
3646 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003647 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003648
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 /* Copy everything else as-is */
3650 else
3651 *p++ = (char) ch;
3652 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003654 assert(p - PyBytes_AS_STRING(repr) > 0);
3655 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3656 return NULL;
3657 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658}
3659
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003660PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003662 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 if (!PyUnicode_Check(unicode)) {
3664 PyErr_BadArgument();
3665 return NULL;
3666 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003667 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3668 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003669 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670}
3671
3672/* --- Raw Unicode Escape Codec ------------------------------------------- */
3673
3674PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003675 Py_ssize_t size,
3676 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003679 Py_ssize_t startinpos;
3680 Py_ssize_t endinpos;
3681 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 const char *end;
3685 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 PyObject *errorHandler = NULL;
3687 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003688
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 /* Escaped strings will always be longer than the resulting
3690 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 length after conversion to the true value. (But decoding error
3692 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 v = _PyUnicode_New(size);
3694 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 end = s + size;
3700 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 unsigned char c;
3702 Py_UCS4 x;
3703 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003704 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705
Benjamin Peterson29060642009-01-31 22:14:21 +00003706 /* Non-escape characters are interpreted as Unicode ordinals */
3707 if (*s != '\\') {
3708 *p++ = (unsigned char)*s++;
3709 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003711 startinpos = s-starts;
3712
3713 /* \u-escapes are only interpreted iff the number of leading
3714 backslashes if odd */
3715 bs = s;
3716 for (;s < end;) {
3717 if (*s != '\\')
3718 break;
3719 *p++ = (unsigned char)*s++;
3720 }
3721 if (((s - bs) & 1) == 0 ||
3722 s >= end ||
3723 (*s != 'u' && *s != 'U')) {
3724 continue;
3725 }
3726 p--;
3727 count = *s=='u' ? 4 : 8;
3728 s++;
3729
3730 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3731 outpos = p-PyUnicode_AS_UNICODE(v);
3732 for (x = 0, i = 0; i < count; ++i, ++s) {
3733 c = (unsigned char)*s;
3734 if (!ISXDIGIT(c)) {
3735 endinpos = s-starts;
3736 if (unicode_decode_call_errorhandler(
3737 errors, &errorHandler,
3738 "rawunicodeescape", "truncated \\uXXXX",
3739 &starts, &end, &startinpos, &endinpos, &exc, &s,
3740 &v, &outpos, &p))
3741 goto onError;
3742 goto nextByte;
3743 }
3744 x = (x<<4) & ~0xF;
3745 if (c >= '0' && c <= '9')
3746 x += c - '0';
3747 else if (c >= 'a' && c <= 'f')
3748 x += 10 + c - 'a';
3749 else
3750 x += 10 + c - 'A';
3751 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003752 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003753 /* UCS-2 character */
3754 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003755 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 /* UCS-4 character. Either store directly, or as
3757 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003758#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003760#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003761 x -= 0x10000L;
3762 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3763 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003764#endif
3765 } else {
3766 endinpos = s-starts;
3767 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003768 if (unicode_decode_call_errorhandler(
3769 errors, &errorHandler,
3770 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003771 &starts, &end, &startinpos, &endinpos, &exc, &s,
3772 &v, &outpos, &p))
3773 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003774 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003775 nextByte:
3776 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003778 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003779 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780 Py_XDECREF(errorHandler);
3781 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003783
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 Py_XDECREF(errorHandler);
3787 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 return NULL;
3789}
3790
3791PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003792 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003794 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 char *p;
3796 char *q;
3797
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003798#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003799 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003800#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003801 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003802#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003803
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003804 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003806
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003807 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 if (repr == NULL)
3809 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003810 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003811 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003813 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 while (size-- > 0) {
3815 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003816#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 /* Map 32-bit characters to '\Uxxxxxxxx' */
3818 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003819 *p++ = '\\';
3820 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003821 *p++ = hexdigits[(ch >> 28) & 0xf];
3822 *p++ = hexdigits[(ch >> 24) & 0xf];
3823 *p++ = hexdigits[(ch >> 20) & 0xf];
3824 *p++ = hexdigits[(ch >> 16) & 0xf];
3825 *p++ = hexdigits[(ch >> 12) & 0xf];
3826 *p++ = hexdigits[(ch >> 8) & 0xf];
3827 *p++ = hexdigits[(ch >> 4) & 0xf];
3828 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003829 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003830 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003831#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003832 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3833 if (ch >= 0xD800 && ch < 0xDC00) {
3834 Py_UNICODE ch2;
3835 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003836
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 ch2 = *s++;
3838 size--;
Georg Brandl0f147092010-08-01 20:54:22 +00003839 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3841 *p++ = '\\';
3842 *p++ = 'U';
3843 *p++ = hexdigits[(ucs >> 28) & 0xf];
3844 *p++ = hexdigits[(ucs >> 24) & 0xf];
3845 *p++ = hexdigits[(ucs >> 20) & 0xf];
3846 *p++ = hexdigits[(ucs >> 16) & 0xf];
3847 *p++ = hexdigits[(ucs >> 12) & 0xf];
3848 *p++ = hexdigits[(ucs >> 8) & 0xf];
3849 *p++ = hexdigits[(ucs >> 4) & 0xf];
3850 *p++ = hexdigits[ucs & 0xf];
3851 continue;
3852 }
3853 /* Fall through: isolated surrogates are copied as-is */
3854 s--;
3855 size++;
3856 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003857#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003858 /* Map 16-bit characters to '\uxxxx' */
3859 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 *p++ = '\\';
3861 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003862 *p++ = hexdigits[(ch >> 12) & 0xf];
3863 *p++ = hexdigits[(ch >> 8) & 0xf];
3864 *p++ = hexdigits[(ch >> 4) & 0xf];
3865 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 /* Copy everything else as-is */
3868 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 *p++ = (char) ch;
3870 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003871 size = p - q;
3872
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003873 assert(size > 0);
3874 if (_PyBytes_Resize(&repr, size) < 0)
3875 return NULL;
3876 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877}
3878
3879PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3880{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003881 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003883 PyErr_BadArgument();
3884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003886 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3887 PyUnicode_GET_SIZE(unicode));
3888
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003889 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890}
3891
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003892/* --- Unicode Internal Codec ------------------------------------------- */
3893
3894PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003895 Py_ssize_t size,
3896 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003897{
3898 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003899 Py_ssize_t startinpos;
3900 Py_ssize_t endinpos;
3901 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003902 PyUnicodeObject *v;
3903 Py_UNICODE *p;
3904 const char *end;
3905 const char *reason;
3906 PyObject *errorHandler = NULL;
3907 PyObject *exc = NULL;
3908
Neal Norwitzd43069c2006-01-08 01:12:10 +00003909#ifdef Py_UNICODE_WIDE
3910 Py_UNICODE unimax = PyUnicode_GetMax();
3911#endif
3912
Thomas Wouters89f507f2006-12-13 04:49:30 +00003913 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003914 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3915 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003916 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003917 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003918 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003919 p = PyUnicode_AS_UNICODE(v);
3920 end = s + size;
3921
3922 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003923 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003924 /* We have to sanity check the raw data, otherwise doom looms for
3925 some malformed UCS-4 data. */
3926 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003927#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003928 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003929#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003930 end-s < Py_UNICODE_SIZE
3931 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003932 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003933 startinpos = s - starts;
3934 if (end-s < Py_UNICODE_SIZE) {
3935 endinpos = end-starts;
3936 reason = "truncated input";
3937 }
3938 else {
3939 endinpos = s - starts + Py_UNICODE_SIZE;
3940 reason = "illegal code point (> 0x10FFFF)";
3941 }
3942 outpos = p - PyUnicode_AS_UNICODE(v);
3943 if (unicode_decode_call_errorhandler(
3944 errors, &errorHandler,
3945 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003946 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003947 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003948 goto onError;
3949 }
3950 }
3951 else {
3952 p++;
3953 s += Py_UNICODE_SIZE;
3954 }
3955 }
3956
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003957 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003958 goto onError;
3959 Py_XDECREF(errorHandler);
3960 Py_XDECREF(exc);
3961 return (PyObject *)v;
3962
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003964 Py_XDECREF(v);
3965 Py_XDECREF(errorHandler);
3966 Py_XDECREF(exc);
3967 return NULL;
3968}
3969
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970/* --- Latin-1 Codec ------------------------------------------------------ */
3971
3972PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003973 Py_ssize_t size,
3974 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975{
3976 PyUnicodeObject *v;
3977 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003978 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003979
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003981 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 Py_UNICODE r = *(unsigned char*)s;
3983 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003984 }
3985
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 v = _PyUnicode_New(size);
3987 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003992 e = s + size;
3993 /* Unrolling the copy makes it much faster by reducing the looping
3994 overhead. This is similar to what many memcpy() implementations do. */
3995 unrolled_end = e - 4;
3996 while (s < unrolled_end) {
3997 p[0] = (unsigned char) s[0];
3998 p[1] = (unsigned char) s[1];
3999 p[2] = (unsigned char) s[2];
4000 p[3] = (unsigned char) s[3];
4001 s += 4;
4002 p += 4;
4003 }
4004 while (s < e)
4005 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004007
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 Py_XDECREF(v);
4010 return NULL;
4011}
4012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004013/* create or adjust a UnicodeEncodeError */
4014static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 const char *encoding,
4016 const Py_UNICODE *unicode, Py_ssize_t size,
4017 Py_ssize_t startpos, Py_ssize_t endpos,
4018 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004020 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004021 *exceptionObject = PyUnicodeEncodeError_Create(
4022 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 }
4024 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4026 goto onError;
4027 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4028 goto onError;
4029 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4030 goto onError;
4031 return;
4032 onError:
4033 Py_DECREF(*exceptionObject);
4034 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 }
4036}
4037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038/* raises a UnicodeEncodeError */
4039static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 const char *encoding,
4041 const Py_UNICODE *unicode, Py_ssize_t size,
4042 Py_ssize_t startpos, Py_ssize_t endpos,
4043 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044{
4045 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004046 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004048 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004049}
4050
4051/* error handling callback helper:
4052 build arguments, call the callback and check the arguments,
4053 put the result into newpos and return the replacement string, which
4054 has to be freed by the caller */
4055static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004056 PyObject **errorHandler,
4057 const char *encoding, const char *reason,
4058 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4059 Py_ssize_t startpos, Py_ssize_t endpos,
4060 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004062 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004063
4064 PyObject *restuple;
4065 PyObject *resunicode;
4066
4067 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004068 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004069 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 }
4072
4073 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004074 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004076 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077
4078 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004083 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 Py_DECREF(restuple);
4085 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004087 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 &resunicode, newpos)) {
4089 Py_DECREF(restuple);
4090 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004091 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004092 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4093 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4094 Py_DECREF(restuple);
4095 return NULL;
4096 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004098 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004099 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4101 Py_DECREF(restuple);
4102 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004103 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 Py_INCREF(resunicode);
4105 Py_DECREF(restuple);
4106 return resunicode;
4107}
4108
4109static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 Py_ssize_t size,
4111 const char *errors,
4112 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113{
4114 /* output object */
4115 PyObject *res;
4116 /* pointers to the beginning and end+1 of input */
4117 const Py_UNICODE *startp = p;
4118 const Py_UNICODE *endp = p + size;
4119 /* pointer to the beginning of the unencodable characters */
4120 /* const Py_UNICODE *badp = NULL; */
4121 /* pointer into the output */
4122 char *str;
4123 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004124 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004125 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4126 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 PyObject *errorHandler = NULL;
4128 PyObject *exc = NULL;
4129 /* the following variable is used for caching string comparisons
4130 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4131 int known_errorHandler = -1;
4132
4133 /* allocate enough for a simple encoding without
4134 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004135 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004136 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004137 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004139 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004140 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 ressize = size;
4142
4143 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 /* can we encode this? */
4147 if (c<limit) {
4148 /* no overflow check, because we know that the space is enough */
4149 *str++ = (char)c;
4150 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004151 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 else {
4153 Py_ssize_t unicodepos = p-startp;
4154 Py_ssize_t requiredsize;
4155 PyObject *repunicode;
4156 Py_ssize_t repsize;
4157 Py_ssize_t newpos;
4158 Py_ssize_t respos;
4159 Py_UNICODE *uni2;
4160 /* startpos for collecting unencodable chars */
4161 const Py_UNICODE *collstart = p;
4162 const Py_UNICODE *collend = p;
4163 /* find all unecodable characters */
4164 while ((collend < endp) && ((*collend)>=limit))
4165 ++collend;
4166 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4167 if (known_errorHandler==-1) {
4168 if ((errors==NULL) || (!strcmp(errors, "strict")))
4169 known_errorHandler = 1;
4170 else if (!strcmp(errors, "replace"))
4171 known_errorHandler = 2;
4172 else if (!strcmp(errors, "ignore"))
4173 known_errorHandler = 3;
4174 else if (!strcmp(errors, "xmlcharrefreplace"))
4175 known_errorHandler = 4;
4176 else
4177 known_errorHandler = 0;
4178 }
4179 switch (known_errorHandler) {
4180 case 1: /* strict */
4181 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4182 goto onError;
4183 case 2: /* replace */
4184 while (collstart++<collend)
4185 *str++ = '?'; /* fall through */
4186 case 3: /* ignore */
4187 p = collend;
4188 break;
4189 case 4: /* xmlcharrefreplace */
4190 respos = str - PyBytes_AS_STRING(res);
4191 /* determine replacement size (temporarily (mis)uses p) */
4192 for (p = collstart, repsize = 0; p < collend; ++p) {
4193 if (*p<10)
4194 repsize += 2+1+1;
4195 else if (*p<100)
4196 repsize += 2+2+1;
4197 else if (*p<1000)
4198 repsize += 2+3+1;
4199 else if (*p<10000)
4200 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004201#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 else
4203 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004204#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 else if (*p<100000)
4206 repsize += 2+5+1;
4207 else if (*p<1000000)
4208 repsize += 2+6+1;
4209 else
4210 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004211#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 }
4213 requiredsize = respos+repsize+(endp-collend);
4214 if (requiredsize > ressize) {
4215 if (requiredsize<2*ressize)
4216 requiredsize = 2*ressize;
4217 if (_PyBytes_Resize(&res, requiredsize))
4218 goto onError;
4219 str = PyBytes_AS_STRING(res) + respos;
4220 ressize = requiredsize;
4221 }
4222 /* generate replacement (temporarily (mis)uses p) */
4223 for (p = collstart; p < collend; ++p) {
4224 str += sprintf(str, "&#%d;", (int)*p);
4225 }
4226 p = collend;
4227 break;
4228 default:
4229 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4230 encoding, reason, startp, size, &exc,
4231 collstart-startp, collend-startp, &newpos);
4232 if (repunicode == NULL)
4233 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004234 if (PyBytes_Check(repunicode)) {
4235 /* Directly copy bytes result to output. */
4236 repsize = PyBytes_Size(repunicode);
4237 if (repsize > 1) {
4238 /* Make room for all additional bytes. */
Amaury Forgeot d'Arce5344d62009-06-29 22:38:54 +00004239 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004240 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4241 Py_DECREF(repunicode);
4242 goto onError;
4243 }
Amaury Forgeot d'Arce5344d62009-06-29 22:38:54 +00004244 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004245 ressize += repsize-1;
4246 }
4247 memcpy(str, PyBytes_AsString(repunicode), repsize);
4248 str += repsize;
4249 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004250 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004251 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004252 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 /* need more space? (at least enough for what we
4254 have+the replacement+the rest of the string, so
4255 we won't have to check space for encodable characters) */
4256 respos = str - PyBytes_AS_STRING(res);
4257 repsize = PyUnicode_GET_SIZE(repunicode);
4258 requiredsize = respos+repsize+(endp-collend);
4259 if (requiredsize > ressize) {
4260 if (requiredsize<2*ressize)
4261 requiredsize = 2*ressize;
4262 if (_PyBytes_Resize(&res, requiredsize)) {
4263 Py_DECREF(repunicode);
4264 goto onError;
4265 }
4266 str = PyBytes_AS_STRING(res) + respos;
4267 ressize = requiredsize;
4268 }
4269 /* check if there is anything unencodable in the replacement
4270 and copy it to the output */
4271 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4272 c = *uni2;
4273 if (c >= limit) {
4274 raise_encode_exception(&exc, encoding, startp, size,
4275 unicodepos, unicodepos+1, reason);
4276 Py_DECREF(repunicode);
4277 goto onError;
4278 }
4279 *str = (char)c;
4280 }
4281 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004282 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004283 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004284 }
4285 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004286 /* Resize if we allocated to much */
4287 size = str - PyBytes_AS_STRING(res);
4288 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004289 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004290 if (_PyBytes_Resize(&res, size) < 0)
4291 goto onError;
4292 }
4293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004294 Py_XDECREF(errorHandler);
4295 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004296 return res;
4297
4298 onError:
4299 Py_XDECREF(res);
4300 Py_XDECREF(errorHandler);
4301 Py_XDECREF(exc);
4302 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303}
4304
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004306 Py_ssize_t size,
4307 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310}
4311
4312PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4313{
4314 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 PyErr_BadArgument();
4316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 }
4318 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004319 PyUnicode_GET_SIZE(unicode),
4320 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321}
4322
4323/* --- 7-bit ASCII Codec -------------------------------------------------- */
4324
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004326 Py_ssize_t size,
4327 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 PyUnicodeObject *v;
4331 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004332 Py_ssize_t startinpos;
4333 Py_ssize_t endinpos;
4334 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 const char *e;
4336 PyObject *errorHandler = NULL;
4337 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004338
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004340 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004341 Py_UNICODE r = *(unsigned char*)s;
4342 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004343 }
Tim Petersced69f82003-09-16 20:30:58 +00004344
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 v = _PyUnicode_New(size);
4346 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351 e = s + size;
4352 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 register unsigned char c = (unsigned char)*s;
4354 if (c < 128) {
4355 *p++ = c;
4356 ++s;
4357 }
4358 else {
4359 startinpos = s-starts;
4360 endinpos = startinpos + 1;
4361 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4362 if (unicode_decode_call_errorhandler(
4363 errors, &errorHandler,
4364 "ascii", "ordinal not in range(128)",
4365 &starts, &e, &startinpos, &endinpos, &exc, &s,
4366 &v, &outpos, &p))
4367 goto onError;
4368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004370 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4372 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373 Py_XDECREF(errorHandler);
4374 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004376
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379 Py_XDECREF(errorHandler);
4380 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 return NULL;
4382}
4383
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 Py_ssize_t size,
4386 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389}
4390
4391PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4392{
4393 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 PyErr_BadArgument();
4395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 }
4397 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 PyUnicode_GET_SIZE(unicode),
4399 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400}
4401
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004402#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004403
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004404/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004405
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004406#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004407#define NEED_RETRY
4408#endif
4409
4410/* XXX This code is limited to "true" double-byte encodings, as
4411 a) it assumes an incomplete character consists of a single byte, and
4412 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004414
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   A byte for which IsDBCSLeadByte() is true only counts when one of the
   following holds for the character start reported by CharPrev():
     - CharPrev() did not move (prev == current position),
     - the byte at that previous position is not itself a lead byte,
     - the previous character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (pos - before) == 2;
}
4425
4426/*
4427 * Decode MBCS string into unicode object. If 'final' is set, converts
4428 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4429 */
4430static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 const char *s, /* MBCS string */
4432 int size, /* sizeof MBCS string */
4433 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004434{
4435 Py_UNICODE *p;
4436 Py_ssize_t n = 0;
4437 int usize = 0;
4438
4439 assert(size >= 0);
4440
4441 /* Skip trailing lead-byte unless 'final' is set */
4442 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004444
4445 /* First get the size of the result */
4446 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004447 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4448 if (usize == 0) {
4449 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4450 return -1;
4451 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004452 }
4453
4454 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 /* Create unicode object */
4456 *v = _PyUnicode_New(usize);
4457 if (*v == NULL)
4458 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004459 }
4460 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 /* Extend unicode object */
4462 n = PyUnicode_GET_SIZE(*v);
4463 if (_PyUnicode_Resize(v, n + usize) < 0)
4464 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004465 }
4466
4467 /* Do the conversion */
4468 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 p = PyUnicode_AS_UNICODE(*v) + n;
4470 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4471 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4472 return -1;
4473 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004474 }
4475
4476 return size;
4477}
4478
4479PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 Py_ssize_t size,
4481 const char *errors,
4482 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004483{
4484 PyUnicodeObject *v = NULL;
4485 int done;
4486
4487 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004489
4490#ifdef NEED_RETRY
4491 retry:
4492 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004494 else
4495#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004497
4498 if (done < 0) {
4499 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004501 }
4502
4503 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004505
4506#ifdef NEED_RETRY
4507 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 s += done;
4509 size -= done;
4510 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004511 }
4512#endif
4513
4514 return (PyObject *)v;
4515}
4516
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004517PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 Py_ssize_t size,
4519 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004520{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004521 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4522}
4523
4524/*
4525 * Convert unicode into string object (MBCS).
4526 * Returns 0 if succeed, -1 otherwise.
4527 */
4528static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 const Py_UNICODE *p, /* unicode */
4530 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004531{
4532 int mbcssize = 0;
4533 Py_ssize_t n = 0;
4534
4535 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004536
4537 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004538 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4540 if (mbcssize == 0) {
4541 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4542 return -1;
4543 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004544 }
4545
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004546 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 /* Create string object */
4548 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4549 if (*repr == NULL)
4550 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004551 }
4552 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 /* Extend string object */
4554 n = PyBytes_Size(*repr);
4555 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4556 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004557 }
4558
4559 /* Do the conversion */
4560 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 char *s = PyBytes_AS_STRING(*repr) + n;
4562 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4563 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4564 return -1;
4565 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004566 }
4567
4568 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004569}
4570
4571PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004572 Py_ssize_t size,
4573 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004574{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004575 PyObject *repr = NULL;
4576 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004577
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004578#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004580 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004582 else
4583#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004585
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004586 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 Py_XDECREF(repr);
4588 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004589 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004590
4591#ifdef NEED_RETRY
4592 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 p += INT_MAX;
4594 size -= INT_MAX;
4595 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004596 }
4597#endif
4598
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004599 return repr;
4600}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004601
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004602PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4603{
4604 if (!PyUnicode_Check(unicode)) {
4605 PyErr_BadArgument();
4606 return NULL;
4607 }
4608 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004609 PyUnicode_GET_SIZE(unicode),
4610 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004611}
4612
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004613#undef NEED_RETRY
4614
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004615#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004616
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617/* --- Character Mapping Codec -------------------------------------------- */
4618
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 Py_ssize_t size,
4621 PyObject *mapping,
4622 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004625 Py_ssize_t startinpos;
4626 Py_ssize_t endinpos;
4627 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629 PyUnicodeObject *v;
4630 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004631 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632 PyObject *errorHandler = NULL;
4633 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004634 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004635 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004636
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 /* Default to Latin-1 */
4638 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004639 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640
4641 v = _PyUnicode_New(size);
4642 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004647 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004648 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004649 mapstring = PyUnicode_AS_UNICODE(mapping);
4650 maplen = PyUnicode_GET_SIZE(mapping);
4651 while (s < e) {
4652 unsigned char ch = *s;
4653 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 if (ch < maplen)
4656 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 if (x == 0xfffe) {
4659 /* undefined mapping */
4660 outpos = p-PyUnicode_AS_UNICODE(v);
4661 startinpos = s-starts;
4662 endinpos = startinpos+1;
4663 if (unicode_decode_call_errorhandler(
4664 errors, &errorHandler,
4665 "charmap", "character maps to <undefined>",
4666 &starts, &e, &startinpos, &endinpos, &exc, &s,
4667 &v, &outpos, &p)) {
4668 goto onError;
4669 }
4670 continue;
4671 }
4672 *p++ = x;
4673 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004674 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004675 }
4676 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 while (s < e) {
4678 unsigned char ch = *s;
4679 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004680
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4682 w = PyLong_FromLong((long)ch);
4683 if (w == NULL)
4684 goto onError;
4685 x = PyObject_GetItem(mapping, w);
4686 Py_DECREF(w);
4687 if (x == NULL) {
4688 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4689 /* No mapping found means: mapping is undefined. */
4690 PyErr_Clear();
4691 x = Py_None;
4692 Py_INCREF(x);
4693 } else
4694 goto onError;
4695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004696
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 /* Apply mapping */
4698 if (PyLong_Check(x)) {
4699 long value = PyLong_AS_LONG(x);
4700 if (value < 0 || value > 65535) {
4701 PyErr_SetString(PyExc_TypeError,
4702 "character mapping must be in range(65536)");
4703 Py_DECREF(x);
4704 goto onError;
4705 }
4706 *p++ = (Py_UNICODE)value;
4707 }
4708 else if (x == Py_None) {
4709 /* undefined mapping */
4710 outpos = p-PyUnicode_AS_UNICODE(v);
4711 startinpos = s-starts;
4712 endinpos = startinpos+1;
4713 if (unicode_decode_call_errorhandler(
4714 errors, &errorHandler,
4715 "charmap", "character maps to <undefined>",
4716 &starts, &e, &startinpos, &endinpos, &exc, &s,
4717 &v, &outpos, &p)) {
4718 Py_DECREF(x);
4719 goto onError;
4720 }
4721 Py_DECREF(x);
4722 continue;
4723 }
4724 else if (PyUnicode_Check(x)) {
4725 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004726
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 if (targetsize == 1)
4728 /* 1-1 mapping */
4729 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004730
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 else if (targetsize > 1) {
4732 /* 1-n mapping */
4733 if (targetsize > extrachars) {
4734 /* resize first */
4735 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4736 Py_ssize_t needed = (targetsize - extrachars) + \
4737 (targetsize << 2);
4738 extrachars += needed;
4739 /* XXX overflow detection missing */
4740 if (_PyUnicode_Resize(&v,
4741 PyUnicode_GET_SIZE(v) + needed) < 0) {
4742 Py_DECREF(x);
4743 goto onError;
4744 }
4745 p = PyUnicode_AS_UNICODE(v) + oldpos;
4746 }
4747 Py_UNICODE_COPY(p,
4748 PyUnicode_AS_UNICODE(x),
4749 targetsize);
4750 p += targetsize;
4751 extrachars -= targetsize;
4752 }
4753 /* 1-0 mapping: skip the character */
4754 }
4755 else {
4756 /* wrong return value */
4757 PyErr_SetString(PyExc_TypeError,
4758 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004759 Py_DECREF(x);
4760 goto onError;
4761 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 Py_DECREF(x);
4763 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 }
4766 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4768 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 Py_XDECREF(errorHandler);
4770 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004772
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 Py_XDECREF(errorHandler);
4775 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 Py_XDECREF(v);
4777 return NULL;
4778}
4779
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004780/* Charmap encoding: the lookup table */
4781
4782struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004783 PyObject_HEAD
4784 unsigned char level1[32];
4785 int count2, count3;
4786 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004787};
4788
4789static PyObject*
4790encoding_map_size(PyObject *obj, PyObject* args)
4791{
4792 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004793 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004795}
4796
4797static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004798 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004799 PyDoc_STR("Return the size (in bytes) of this object") },
4800 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004801};
4802
4803static void
4804encoding_map_dealloc(PyObject* o)
4805{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004806 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004807}
4808
4809static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004810 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 "EncodingMap", /*tp_name*/
4812 sizeof(struct encoding_map), /*tp_basicsize*/
4813 0, /*tp_itemsize*/
4814 /* methods */
4815 encoding_map_dealloc, /*tp_dealloc*/
4816 0, /*tp_print*/
4817 0, /*tp_getattr*/
4818 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004819 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 0, /*tp_repr*/
4821 0, /*tp_as_number*/
4822 0, /*tp_as_sequence*/
4823 0, /*tp_as_mapping*/
4824 0, /*tp_hash*/
4825 0, /*tp_call*/
4826 0, /*tp_str*/
4827 0, /*tp_getattro*/
4828 0, /*tp_setattro*/
4829 0, /*tp_as_buffer*/
4830 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4831 0, /*tp_doc*/
4832 0, /*tp_traverse*/
4833 0, /*tp_clear*/
4834 0, /*tp_richcompare*/
4835 0, /*tp_weaklistoffset*/
4836 0, /*tp_iter*/
4837 0, /*tp_iternext*/
4838 encoding_map_methods, /*tp_methods*/
4839 0, /*tp_members*/
4840 0, /*tp_getset*/
4841 0, /*tp_base*/
4842 0, /*tp_dict*/
4843 0, /*tp_descr_get*/
4844 0, /*tp_descr_set*/
4845 0, /*tp_dictoffset*/
4846 0, /*tp_init*/
4847 0, /*tp_alloc*/
4848 0, /*tp_new*/
4849 0, /*tp_free*/
4850 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004851};
4852
4853PyObject*
4854PyUnicode_BuildEncodingMap(PyObject* string)
4855{
4856 Py_UNICODE *decode;
4857 PyObject *result;
4858 struct encoding_map *mresult;
4859 int i;
4860 int need_dict = 0;
4861 unsigned char level1[32];
4862 unsigned char level2[512];
4863 unsigned char *mlevel1, *mlevel2, *mlevel3;
4864 int count2 = 0, count3 = 0;
4865
4866 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4867 PyErr_BadArgument();
4868 return NULL;
4869 }
4870 decode = PyUnicode_AS_UNICODE(string);
4871 memset(level1, 0xFF, sizeof level1);
4872 memset(level2, 0xFF, sizeof level2);
4873
4874 /* If there isn't a one-to-one mapping of NULL to \0,
4875 or if there are non-BMP characters, we need to use
4876 a mapping dictionary. */
4877 if (decode[0] != 0)
4878 need_dict = 1;
4879 for (i = 1; i < 256; i++) {
4880 int l1, l2;
4881 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004882#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004883 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004884#endif
4885 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004886 need_dict = 1;
4887 break;
4888 }
4889 if (decode[i] == 0xFFFE)
4890 /* unmapped character */
4891 continue;
4892 l1 = decode[i] >> 11;
4893 l2 = decode[i] >> 7;
4894 if (level1[l1] == 0xFF)
4895 level1[l1] = count2++;
4896 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004897 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004898 }
4899
4900 if (count2 >= 0xFF || count3 >= 0xFF)
4901 need_dict = 1;
4902
4903 if (need_dict) {
4904 PyObject *result = PyDict_New();
4905 PyObject *key, *value;
4906 if (!result)
4907 return NULL;
4908 for (i = 0; i < 256; i++) {
4909 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004910 key = PyLong_FromLong(decode[i]);
4911 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004912 if (!key || !value)
4913 goto failed1;
4914 if (PyDict_SetItem(result, key, value) == -1)
4915 goto failed1;
4916 Py_DECREF(key);
4917 Py_DECREF(value);
4918 }
4919 return result;
4920 failed1:
4921 Py_XDECREF(key);
4922 Py_XDECREF(value);
4923 Py_DECREF(result);
4924 return NULL;
4925 }
4926
4927 /* Create a three-level trie */
4928 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4929 16*count2 + 128*count3 - 1);
4930 if (!result)
4931 return PyErr_NoMemory();
4932 PyObject_Init(result, &EncodingMapType);
4933 mresult = (struct encoding_map*)result;
4934 mresult->count2 = count2;
4935 mresult->count3 = count3;
4936 mlevel1 = mresult->level1;
4937 mlevel2 = mresult->level23;
4938 mlevel3 = mresult->level23 + 16*count2;
4939 memcpy(mlevel1, level1, 32);
4940 memset(mlevel2, 0xFF, 16*count2);
4941 memset(mlevel3, 0, 128*count3);
4942 count3 = 0;
4943 for (i = 1; i < 256; i++) {
4944 int o1, o2, o3, i2, i3;
4945 if (decode[i] == 0xFFFE)
4946 /* unmapped character */
4947 continue;
4948 o1 = decode[i]>>11;
4949 o2 = (decode[i]>>7) & 0xF;
4950 i2 = 16*mlevel1[o1] + o2;
4951 if (mlevel2[i2] == 0xFF)
4952 mlevel2[i2] = count3++;
4953 o3 = decode[i] & 0x7F;
4954 i3 = 128*mlevel2[i2] + o3;
4955 mlevel3[i3] = i;
4956 }
4957 return result;
4958}
4959
4960static int
4961encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4962{
4963 struct encoding_map *map = (struct encoding_map*)mapping;
4964 int l1 = c>>11;
4965 int l2 = (c>>7) & 0xF;
4966 int l3 = c & 0x7F;
4967 int i;
4968
4969#ifdef Py_UNICODE_WIDE
4970 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004972 }
4973#endif
4974 if (c == 0)
4975 return 0;
4976 /* level 1*/
4977 i = map->level1[l1];
4978 if (i == 0xFF) {
4979 return -1;
4980 }
4981 /* level 2*/
4982 i = map->level23[16*i+l2];
4983 if (i == 0xFF) {
4984 return -1;
4985 }
4986 /* level 3 */
4987 i = map->level23[16*map->count2 + 128*i + l3];
4988 if (i == 0) {
4989 return -1;
4990 }
4991 return i;
4992}
4993
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994/* Lookup the character ch in the mapping. If the character
4995 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004996 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004997static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998{
Christian Heimes217cfd12007-12-02 14:31:20 +00004999 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 PyObject *x;
5001
5002 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005004 x = PyObject_GetItem(mapping, w);
5005 Py_DECREF(w);
5006 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5008 /* No mapping found means: mapping is undefined. */
5009 PyErr_Clear();
5010 x = Py_None;
5011 Py_INCREF(x);
5012 return x;
5013 } else
5014 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005016 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005018 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 long value = PyLong_AS_LONG(x);
5020 if (value < 0 || value > 255) {
5021 PyErr_SetString(PyExc_TypeError,
5022 "character mapping must be in range(256)");
5023 Py_DECREF(x);
5024 return NULL;
5025 }
5026 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005028 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 /* wrong return value */
5032 PyErr_Format(PyExc_TypeError,
5033 "character mapping must return integer, bytes or None, not %.400s",
5034 x->ob_type->tp_name);
5035 Py_DECREF(x);
5036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 }
5038}
5039
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005040static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005041charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005042{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005043 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5044 /* exponentially overallocate to minimize reallocations */
5045 if (requiredsize < 2*outsize)
5046 requiredsize = 2*outsize;
5047 if (_PyBytes_Resize(outobj, requiredsize))
5048 return -1;
5049 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005050}
5051
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005055/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005056 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 space is available. Return a new reference to the object that
5058 was put in the output buffer, or Py_None, if the mapping was undefined
5059 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005060 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005062charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005064{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005065 PyObject *rep;
5066 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005067 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005068
Christian Heimes90aa7642007-12-19 02:45:37 +00005069 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005070 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005071 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005072 if (res == -1)
5073 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 if (outsize<requiredsize)
5075 if (charmapencode_resize(outobj, outpos, requiredsize))
5076 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005077 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 outstart[(*outpos)++] = (char)res;
5079 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005080 }
5081
5082 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005083 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005085 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 Py_DECREF(rep);
5087 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005088 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 if (PyLong_Check(rep)) {
5090 Py_ssize_t requiredsize = *outpos+1;
5091 if (outsize<requiredsize)
5092 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5093 Py_DECREF(rep);
5094 return enc_EXCEPTION;
5095 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005096 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005098 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 else {
5100 const char *repchars = PyBytes_AS_STRING(rep);
5101 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5102 Py_ssize_t requiredsize = *outpos+repsize;
5103 if (outsize<requiredsize)
5104 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5105 Py_DECREF(rep);
5106 return enc_EXCEPTION;
5107 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005108 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 memcpy(outstart + *outpos, repchars, repsize);
5110 *outpos += repsize;
5111 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005113 Py_DECREF(rep);
5114 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115}
5116
5117/* handle an error in PyUnicode_EncodeCharmap
5118 Return 0 on success, -1 on error */
5119static
5120int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005121 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005123 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005124 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125{
5126 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005127 Py_ssize_t repsize;
5128 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 Py_UNICODE *uni2;
5130 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005131 Py_ssize_t collstartpos = *inpos;
5132 Py_ssize_t collendpos = *inpos+1;
5133 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005134 char *encoding = "charmap";
5135 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005136 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 /* find all unencodable characters */
5139 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005140 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005141 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 int res = encoding_map_lookup(p[collendpos], mapping);
5143 if (res != -1)
5144 break;
5145 ++collendpos;
5146 continue;
5147 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005148
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 rep = charmapencode_lookup(p[collendpos], mapping);
5150 if (rep==NULL)
5151 return -1;
5152 else if (rep!=Py_None) {
5153 Py_DECREF(rep);
5154 break;
5155 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005156 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005158 }
5159 /* cache callback name lookup
5160 * (if not done yet, i.e. it's the first error) */
5161 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005162 if ((errors==NULL) || (!strcmp(errors, "strict")))
5163 *known_errorHandler = 1;
5164 else if (!strcmp(errors, "replace"))
5165 *known_errorHandler = 2;
5166 else if (!strcmp(errors, "ignore"))
5167 *known_errorHandler = 3;
5168 else if (!strcmp(errors, "xmlcharrefreplace"))
5169 *known_errorHandler = 4;
5170 else
5171 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005172 }
5173 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005174 case 1: /* strict */
5175 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5176 return -1;
5177 case 2: /* replace */
5178 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 x = charmapencode_output('?', mapping, res, respos);
5180 if (x==enc_EXCEPTION) {
5181 return -1;
5182 }
5183 else if (x==enc_FAILED) {
5184 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5185 return -1;
5186 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005187 }
5188 /* fall through */
5189 case 3: /* ignore */
5190 *inpos = collendpos;
5191 break;
5192 case 4: /* xmlcharrefreplace */
5193 /* generate replacement (temporarily (mis)uses p) */
5194 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 char buffer[2+29+1+1];
5196 char *cp;
5197 sprintf(buffer, "&#%d;", (int)p[collpos]);
5198 for (cp = buffer; *cp; ++cp) {
5199 x = charmapencode_output(*cp, mapping, res, respos);
5200 if (x==enc_EXCEPTION)
5201 return -1;
5202 else if (x==enc_FAILED) {
5203 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5204 return -1;
5205 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005206 }
5207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005208 *inpos = collendpos;
5209 break;
5210 default:
5211 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 encoding, reason, p, size, exceptionObject,
5213 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005214 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005216 if (PyBytes_Check(repunicode)) {
5217 /* Directly copy bytes result to output. */
5218 Py_ssize_t outsize = PyBytes_Size(*res);
5219 Py_ssize_t requiredsize;
5220 repsize = PyBytes_Size(repunicode);
5221 requiredsize = *respos + repsize;
5222 if (requiredsize > outsize)
5223 /* Make room for all additional bytes. */
5224 if (charmapencode_resize(res, respos, requiredsize)) {
5225 Py_DECREF(repunicode);
5226 return -1;
5227 }
5228 memcpy(PyBytes_AsString(*res) + *respos,
5229 PyBytes_AsString(repunicode), repsize);
5230 *respos += repsize;
5231 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005232 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005233 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005234 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005235 /* generate replacement */
5236 repsize = PyUnicode_GET_SIZE(repunicode);
5237 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 x = charmapencode_output(*uni2, mapping, res, respos);
5239 if (x==enc_EXCEPTION) {
5240 return -1;
5241 }
5242 else if (x==enc_FAILED) {
5243 Py_DECREF(repunicode);
5244 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5245 return -1;
5246 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005247 }
5248 *inpos = newpos;
5249 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 }
5251 return 0;
5252}
5253
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 Py_ssize_t size,
5256 PyObject *mapping,
5257 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005259 /* output object */
5260 PyObject *res = NULL;
5261 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005262 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005263 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005264 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005265 PyObject *errorHandler = NULL;
5266 PyObject *exc = NULL;
5267 /* the following variable is used for caching string comparisons
5268 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5269 * 3=ignore, 4=xmlcharrefreplace */
5270 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271
5272 /* Default to Latin-1 */
5273 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005276 /* allocate enough for a simple encoding without
5277 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005278 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 if (res == NULL)
5280 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005281 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005284 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 /* try to encode it */
5286 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5287 if (x==enc_EXCEPTION) /* error */
5288 goto onError;
5289 if (x==enc_FAILED) { /* unencodable character */
5290 if (charmap_encoding_error(p, size, &inpos, mapping,
5291 &exc,
5292 &known_errorHandler, &errorHandler, errors,
5293 &res, &respos)) {
5294 goto onError;
5295 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005296 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 else
5298 /* done with this character => adjust input position */
5299 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005302 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005303 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005304 if (_PyBytes_Resize(&res, respos) < 0)
5305 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005307 Py_XDECREF(exc);
5308 Py_XDECREF(errorHandler);
5309 return res;
5310
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005312 Py_XDECREF(res);
5313 Py_XDECREF(exc);
5314 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 return NULL;
5316}
5317
5318PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320{
5321 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 PyErr_BadArgument();
5323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 }
5325 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 PyUnicode_GET_SIZE(unicode),
5327 mapping,
5328 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329}
5330
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005331/* create or adjust a UnicodeTranslateError */
5332static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 const Py_UNICODE *unicode, Py_ssize_t size,
5334 Py_ssize_t startpos, Py_ssize_t endpos,
5335 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005338 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 }
5341 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5343 goto onError;
5344 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5345 goto onError;
5346 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5347 goto onError;
5348 return;
5349 onError:
5350 Py_DECREF(*exceptionObject);
5351 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 }
5353}
5354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355/* raises a UnicodeTranslateError */
5356static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 const Py_UNICODE *unicode, Py_ssize_t size,
5358 Py_ssize_t startpos, Py_ssize_t endpos,
5359 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360{
5361 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365}
5366
5367/* error handling callback helper:
5368 build arguments, call the callback and check the arguments,
5369 put the result into newpos and return the replacement string, which
5370 has to be freed by the caller */
5371static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 PyObject **errorHandler,
5373 const char *reason,
5374 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5375 Py_ssize_t startpos, Py_ssize_t endpos,
5376 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005378 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005379
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005380 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005381 PyObject *restuple;
5382 PyObject *resunicode;
5383
5384 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005388 }
5389
5390 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005392 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005394
5395 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005400 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 Py_DECREF(restuple);
5402 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403 }
5404 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 &resunicode, &i_newpos)) {
5406 Py_DECREF(restuple);
5407 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005408 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005409 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005411 else
5412 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005413 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5415 Py_DECREF(restuple);
5416 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005417 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005418 Py_INCREF(resunicode);
5419 Py_DECREF(restuple);
5420 return resunicode;
5421}
5422
5423/* Lookup the character ch in the mapping and put the result in result,
5424 which must be decrefed by the caller.
5425 Return 0 on success, -1 on error */
5426static
5427int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5428{
Christian Heimes217cfd12007-12-02 14:31:20 +00005429 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005430 PyObject *x;
5431
5432 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 x = PyObject_GetItem(mapping, w);
5435 Py_DECREF(w);
5436 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5438 /* No mapping found means: use 1:1 mapping. */
5439 PyErr_Clear();
5440 *result = NULL;
5441 return 0;
5442 } else
5443 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444 }
5445 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 *result = x;
5447 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005448 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005449 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 long value = PyLong_AS_LONG(x);
5451 long max = PyUnicode_GetMax();
5452 if (value < 0 || value > max) {
5453 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005454 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 Py_DECREF(x);
5456 return -1;
5457 }
5458 *result = x;
5459 return 0;
5460 }
5461 else if (PyUnicode_Check(x)) {
5462 *result = x;
5463 return 0;
5464 }
5465 else {
5466 /* wrong return value */
5467 PyErr_SetString(PyExc_TypeError,
5468 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005469 Py_DECREF(x);
5470 return -1;
5471 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472}
5473/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 if not reallocate and adjust various state variables.
5475 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476static
Walter Dörwald4894c302003-10-24 14:25:28 +00005477int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005480 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005481 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 /* remember old output position */
5483 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5484 /* exponentially overallocate to minimize reallocations */
5485 if (requiredsize < 2 * oldsize)
5486 requiredsize = 2 * oldsize;
5487 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5488 return -1;
5489 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005490 }
5491 return 0;
5492}
5493/* lookup the character, put the result in the output string and adjust
5494 various state variables. Return a new reference to the object that
5495 was put in the output buffer in *result, or Py_None, if the mapping was
5496 undefined (in which case no character was written).
5497 The called must decref result.
5498 Return 0 on success, -1 on error. */
5499static
Walter Dörwald4894c302003-10-24 14:25:28 +00005500int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5502 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503{
Walter Dörwald4894c302003-10-24 14:25:28 +00005504 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 /* not found => default to 1:1 mapping */
5508 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 }
5510 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005512 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 /* no overflow check, because we know that the space is enough */
5514 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 }
5516 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5518 if (repsize==1) {
5519 /* no overflow check, because we know that the space is enough */
5520 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5521 }
5522 else if (repsize!=0) {
5523 /* more than one character */
5524 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5525 (insize - (curinp-startinp)) +
5526 repsize - 1;
5527 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5528 return -1;
5529 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5530 *outp += repsize;
5531 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005532 }
5533 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005535 return 0;
5536}
5537
5538PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 Py_ssize_t size,
5540 PyObject *mapping,
5541 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 /* output object */
5544 PyObject *res = NULL;
5545 /* pointers to the beginning and end+1 of input */
5546 const Py_UNICODE *startp = p;
5547 const Py_UNICODE *endp = p + size;
5548 /* pointer into the output */
5549 Py_UNICODE *str;
5550 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005551 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005552 char *reason = "character maps to <undefined>";
5553 PyObject *errorHandler = NULL;
5554 PyObject *exc = NULL;
5555 /* the following variable is used for caching string comparisons
5556 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5557 * 3=ignore, 4=xmlcharrefreplace */
5558 int known_errorHandler = -1;
5559
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 PyErr_BadArgument();
5562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564
5565 /* allocate enough for a simple 1:1 translation without
5566 replacements, if we need more, we'll resize */
5567 res = PyUnicode_FromUnicode(NULL, size);
5568 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 /* try to encode it */
5576 PyObject *x = NULL;
5577 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5578 Py_XDECREF(x);
5579 goto onError;
5580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005581 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 if (x!=Py_None) /* it worked => adjust input pointer */
5583 ++p;
5584 else { /* untranslatable character */
5585 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5586 Py_ssize_t repsize;
5587 Py_ssize_t newpos;
5588 Py_UNICODE *uni2;
5589 /* startpos for collecting untranslatable chars */
5590 const Py_UNICODE *collstart = p;
5591 const Py_UNICODE *collend = p+1;
5592 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 /* find all untranslatable characters */
5595 while (collend < endp) {
5596 if (charmaptranslate_lookup(*collend, mapping, &x))
5597 goto onError;
5598 Py_XDECREF(x);
5599 if (x!=Py_None)
5600 break;
5601 ++collend;
5602 }
5603 /* cache callback name lookup
5604 * (if not done yet, i.e. it's the first error) */
5605 if (known_errorHandler==-1) {
5606 if ((errors==NULL) || (!strcmp(errors, "strict")))
5607 known_errorHandler = 1;
5608 else if (!strcmp(errors, "replace"))
5609 known_errorHandler = 2;
5610 else if (!strcmp(errors, "ignore"))
5611 known_errorHandler = 3;
5612 else if (!strcmp(errors, "xmlcharrefreplace"))
5613 known_errorHandler = 4;
5614 else
5615 known_errorHandler = 0;
5616 }
5617 switch (known_errorHandler) {
5618 case 1: /* strict */
5619 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005620 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 case 2: /* replace */
5622 /* No need to check for space, this is a 1:1 replacement */
5623 for (coll = collstart; coll<collend; ++coll)
5624 *str++ = '?';
5625 /* fall through */
5626 case 3: /* ignore */
5627 p = collend;
5628 break;
5629 case 4: /* xmlcharrefreplace */
5630 /* generate replacement (temporarily (mis)uses p) */
5631 for (p = collstart; p < collend; ++p) {
5632 char buffer[2+29+1+1];
5633 char *cp;
5634 sprintf(buffer, "&#%d;", (int)*p);
5635 if (charmaptranslate_makespace(&res, &str,
5636 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5637 goto onError;
5638 for (cp = buffer; *cp; ++cp)
5639 *str++ = *cp;
5640 }
5641 p = collend;
5642 break;
5643 default:
5644 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5645 reason, startp, size, &exc,
5646 collstart-startp, collend-startp, &newpos);
5647 if (repunicode == NULL)
5648 goto onError;
5649 /* generate replacement */
5650 repsize = PyUnicode_GET_SIZE(repunicode);
5651 if (charmaptranslate_makespace(&res, &str,
5652 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5653 Py_DECREF(repunicode);
5654 goto onError;
5655 }
5656 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5657 *str++ = *uni2;
5658 p = startp + newpos;
5659 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005660 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005661 }
5662 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 /* Resize if we allocated to much */
5664 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005665 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 if (PyUnicode_Resize(&res, respos) < 0)
5667 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005668 }
5669 Py_XDECREF(exc);
5670 Py_XDECREF(errorHandler);
5671 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 Py_XDECREF(res);
5675 Py_XDECREF(exc);
5676 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 return NULL;
5678}
5679
5680PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 PyObject *mapping,
5682 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683{
5684 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005685
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 str = PyUnicode_FromObject(str);
5687 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 PyUnicode_GET_SIZE(str),
5691 mapping,
5692 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 Py_DECREF(str);
5694 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005695
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 Py_XDECREF(str);
5698 return NULL;
5699}
Tim Petersced69f82003-09-16 20:30:58 +00005700
Guido van Rossum9e896b32000-04-05 20:11:21 +00005701/* --- Decimal Encoder ---------------------------------------------------- */
5702
5703int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 Py_ssize_t length,
5705 char *output,
5706 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005707{
5708 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 PyObject *errorHandler = NULL;
5710 PyObject *exc = NULL;
5711 const char *encoding = "decimal";
5712 const char *reason = "invalid decimal Unicode string";
5713 /* the following variable is used for caching string comparisons
5714 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5715 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005716
5717 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 PyErr_BadArgument();
5719 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005720 }
5721
5722 p = s;
5723 end = s + length;
5724 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 register Py_UNICODE ch = *p;
5726 int decimal;
5727 PyObject *repunicode;
5728 Py_ssize_t repsize;
5729 Py_ssize_t newpos;
5730 Py_UNICODE *uni2;
5731 Py_UNICODE *collstart;
5732 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005733
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005735 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 ++p;
5737 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005738 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 decimal = Py_UNICODE_TODECIMAL(ch);
5740 if (decimal >= 0) {
5741 *output++ = '0' + decimal;
5742 ++p;
5743 continue;
5744 }
5745 if (0 < ch && ch < 256) {
5746 *output++ = (char)ch;
5747 ++p;
5748 continue;
5749 }
5750 /* All other characters are considered unencodable */
5751 collstart = p;
5752 collend = p+1;
5753 while (collend < end) {
5754 if ((0 < *collend && *collend < 256) ||
5755 !Py_UNICODE_ISSPACE(*collend) ||
5756 Py_UNICODE_TODECIMAL(*collend))
5757 break;
5758 }
5759 /* cache callback name lookup
5760 * (if not done yet, i.e. it's the first error) */
5761 if (known_errorHandler==-1) {
5762 if ((errors==NULL) || (!strcmp(errors, "strict")))
5763 known_errorHandler = 1;
5764 else if (!strcmp(errors, "replace"))
5765 known_errorHandler = 2;
5766 else if (!strcmp(errors, "ignore"))
5767 known_errorHandler = 3;
5768 else if (!strcmp(errors, "xmlcharrefreplace"))
5769 known_errorHandler = 4;
5770 else
5771 known_errorHandler = 0;
5772 }
5773 switch (known_errorHandler) {
5774 case 1: /* strict */
5775 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5776 goto onError;
5777 case 2: /* replace */
5778 for (p = collstart; p < collend; ++p)
5779 *output++ = '?';
5780 /* fall through */
5781 case 3: /* ignore */
5782 p = collend;
5783 break;
5784 case 4: /* xmlcharrefreplace */
5785 /* generate replacement (temporarily (mis)uses p) */
5786 for (p = collstart; p < collend; ++p)
5787 output += sprintf(output, "&#%d;", (int)*p);
5788 p = collend;
5789 break;
5790 default:
5791 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5792 encoding, reason, s, length, &exc,
5793 collstart-s, collend-s, &newpos);
5794 if (repunicode == NULL)
5795 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005796 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005797 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005798 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5799 Py_DECREF(repunicode);
5800 goto onError;
5801 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 /* generate replacement */
5803 repsize = PyUnicode_GET_SIZE(repunicode);
5804 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5805 Py_UNICODE ch = *uni2;
5806 if (Py_UNICODE_ISSPACE(ch))
5807 *output++ = ' ';
5808 else {
5809 decimal = Py_UNICODE_TODECIMAL(ch);
5810 if (decimal >= 0)
5811 *output++ = '0' + decimal;
5812 else if (0 < ch && ch < 256)
5813 *output++ = (char)ch;
5814 else {
5815 Py_DECREF(repunicode);
5816 raise_encode_exception(&exc, encoding,
5817 s, length, collstart-s, collend-s, reason);
5818 goto onError;
5819 }
5820 }
5821 }
5822 p = s + newpos;
5823 Py_DECREF(repunicode);
5824 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005825 }
5826 /* 0-terminate the output string */
5827 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005828 Py_XDECREF(exc);
5829 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005830 return 0;
5831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005833 Py_XDECREF(exc);
5834 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005835 return -1;
5836}
5837
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838/* --- Helpers ------------------------------------------------------------ */
5839
Eric Smith8c663262007-08-25 02:26:07 +00005840#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005841#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005842#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005843/* Include _ParseTupleFinds from find.h */
5844#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005845#include "stringlib/find.h"
5846#include "stringlib/partition.h"
5847
Eric Smith5807c412008-05-11 21:00:57 +00005848#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005849#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005850#include "stringlib/localeutil.h"
5851
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` of the enclosing scope and
   clamps them against (obj)->length: a negative value is first offset by the
   length and then clamped to 0, and `end` is capped at the length.

   Wrapped in do { } while (0) so that it expands to exactly one statement
   and is safe inside an unbraced if/else; invoke it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5864
Martin v. Löwis18e16552006-02-15 17:27:45 +00005865Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005866 PyObject *substr,
5867 Py_ssize_t start,
5868 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005870 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005871 PyUnicodeObject* str_obj;
5872 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005873
Thomas Wouters477c8d52006-05-27 19:21:47 +00005874 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5875 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005877 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5878 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 Py_DECREF(str_obj);
5880 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 }
Tim Petersced69f82003-09-16 20:30:58 +00005882
Thomas Wouters477c8d52006-05-27 19:21:47 +00005883 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005884
Thomas Wouters477c8d52006-05-27 19:21:47 +00005885 result = stringlib_count(
5886 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5887 );
5888
5889 Py_DECREF(sub_obj);
5890 Py_DECREF(str_obj);
5891
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 return result;
5893}
5894
Martin v. Löwis18e16552006-02-15 17:27:45 +00005895Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005896 PyObject *sub,
5897 Py_ssize_t start,
5898 Py_ssize_t end,
5899 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005901 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005902
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005904 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005906 sub = PyUnicode_FromObject(sub);
5907 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 Py_DECREF(str);
5909 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 }
Tim Petersced69f82003-09-16 20:30:58 +00005911
Thomas Wouters477c8d52006-05-27 19:21:47 +00005912 if (direction > 0)
5913 result = stringlib_find_slice(
5914 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5915 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5916 start, end
5917 );
5918 else
5919 result = stringlib_rfind_slice(
5920 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5921 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5922 start, end
5923 );
5924
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005926 Py_DECREF(sub);
5927
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 return result;
5929}
5930
Tim Petersced69f82003-09-16 20:30:58 +00005931static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 PyUnicodeObject *substring,
5934 Py_ssize_t start,
5935 Py_ssize_t end,
5936 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 if (substring->length == 0)
5939 return 1;
5940
Thomas Wouters477c8d52006-05-27 19:21:47 +00005941 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942
5943 end -= substring->length;
5944 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946
5947 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 if (Py_UNICODE_MATCH(self, end, substring))
5949 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 } else {
5951 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 }
5954
5955 return 0;
5956}
5957
Martin v. Löwis18e16552006-02-15 17:27:45 +00005958Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 PyObject *substr,
5960 Py_ssize_t start,
5961 Py_ssize_t end,
5962 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005964 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005965
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 str = PyUnicode_FromObject(str);
5967 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 substr = PyUnicode_FromObject(substr);
5970 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 Py_DECREF(str);
5972 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 }
Tim Petersced69f82003-09-16 20:30:58 +00005974
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 (PyUnicodeObject *)substr,
5977 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 Py_DECREF(str);
5979 Py_DECREF(substr);
5980 return result;
5981}
5982
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983/* Apply fixfct filter to the Unicode object self and return a
5984 reference to the modified object */
5985
Tim Petersced69f82003-09-16 20:30:58 +00005986static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989{
5990
5991 PyUnicodeObject *u;
5992
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005996
5997 Py_UNICODE_COPY(u->str, self->str, self->length);
5998
Tim Peters7a29bd52001-09-12 03:03:31 +00005999 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 /* fixfct should return TRUE if it modified the buffer. If
6001 FALSE, return a reference to the original buffer instead
6002 (to save space, not time) */
6003 Py_INCREF(self);
6004 Py_DECREF(u);
6005 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
6007 return (PyObject*) u;
6008}
6009
Tim Petersced69f82003-09-16 20:30:58 +00006010static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011int fixupper(PyUnicodeObject *self)
6012{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006013 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 Py_UNICODE *s = self->str;
6015 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006016
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006019
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 ch = Py_UNICODE_TOUPPER(*s);
6021 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 *s = ch;
6024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 s++;
6026 }
6027
6028 return status;
6029}
6030
Tim Petersced69f82003-09-16 20:30:58 +00006031static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032int fixlower(PyUnicodeObject *self)
6033{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006034 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 Py_UNICODE *s = self->str;
6036 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006037
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006040
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 ch = Py_UNICODE_TOLOWER(*s);
6042 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 *s = ch;
6045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 s++;
6047 }
6048
6049 return status;
6050}
6051
Tim Petersced69f82003-09-16 20:30:58 +00006052static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053int fixswapcase(PyUnicodeObject *self)
6054{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006055 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 Py_UNICODE *s = self->str;
6057 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006058
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 while (len-- > 0) {
6060 if (Py_UNICODE_ISUPPER(*s)) {
6061 *s = Py_UNICODE_TOLOWER(*s);
6062 status = 1;
6063 } else if (Py_UNICODE_ISLOWER(*s)) {
6064 *s = Py_UNICODE_TOUPPER(*s);
6065 status = 1;
6066 }
6067 s++;
6068 }
6069
6070 return status;
6071}
6072
Tim Petersced69f82003-09-16 20:30:58 +00006073static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074int fixcapitalize(PyUnicodeObject *self)
6075{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006076 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006077 Py_UNICODE *s = self->str;
6078 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006079
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006080 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006082 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 *s = Py_UNICODE_TOUPPER(*s);
6084 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006086 s++;
6087 while (--len > 0) {
6088 if (Py_UNICODE_ISUPPER(*s)) {
6089 *s = Py_UNICODE_TOLOWER(*s);
6090 status = 1;
6091 }
6092 s++;
6093 }
6094 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095}
6096
6097static
6098int fixtitle(PyUnicodeObject *self)
6099{
6100 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6101 register Py_UNICODE *e;
6102 int previous_is_cased;
6103
6104 /* Shortcut for single character strings */
6105 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6107 if (*p != ch) {
6108 *p = ch;
6109 return 1;
6110 }
6111 else
6112 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 }
Tim Petersced69f82003-09-16 20:30:58 +00006114
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 e = p + PyUnicode_GET_SIZE(self);
6116 previous_is_cased = 0;
6117 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006119
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 if (previous_is_cased)
6121 *p = Py_UNICODE_TOLOWER(ch);
6122 else
6123 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006124
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 if (Py_UNICODE_ISLOWER(ch) ||
6126 Py_UNICODE_ISUPPER(ch) ||
6127 Py_UNICODE_ISTITLE(ch))
6128 previous_is_cased = 1;
6129 else
6130 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 }
6132 return 1;
6133}
6134
Tim Peters8ce9f162004-08-27 01:49:32 +00006135PyObject *
6136PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137{
Skip Montanaro6543b452004-09-16 03:28:13 +00006138 const Py_UNICODE blank = ' ';
6139 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006140 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006141 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006142 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6143 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006144 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6145 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006146 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006147 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Tim Peters05eba1f2004-08-27 21:32:02 +00006149 fseq = PySequence_Fast(seq, "");
6150 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006151 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006152 }
6153
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006154 /* NOTE: the following code can't call back into Python code,
6155 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006156 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006157
Tim Peters05eba1f2004-08-27 21:32:02 +00006158 seqlen = PySequence_Fast_GET_SIZE(fseq);
6159 /* If empty sequence, return u"". */
6160 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006161 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6162 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006163 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006164 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006165 /* If singleton sequence with an exact Unicode, return that. */
6166 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 item = items[0];
6168 if (PyUnicode_CheckExact(item)) {
6169 Py_INCREF(item);
6170 res = (PyUnicodeObject *)item;
6171 goto Done;
6172 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006173 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006174 else {
6175 /* Set up sep and seplen */
6176 if (separator == NULL) {
6177 sep = &blank;
6178 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006179 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006180 else {
6181 if (!PyUnicode_Check(separator)) {
6182 PyErr_Format(PyExc_TypeError,
6183 "separator: expected str instance,"
6184 " %.80s found",
6185 Py_TYPE(separator)->tp_name);
6186 goto onError;
6187 }
6188 sep = PyUnicode_AS_UNICODE(separator);
6189 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006190 }
6191 }
6192
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006193 /* There are at least two things to join, or else we have a subclass
6194 * of str in the sequence.
6195 * Do a pre-pass to figure out the total amount of space we'll
6196 * need (sz), and see whether all argument are strings.
6197 */
6198 sz = 0;
6199 for (i = 0; i < seqlen; i++) {
6200 const Py_ssize_t old_sz = sz;
6201 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 if (!PyUnicode_Check(item)) {
6203 PyErr_Format(PyExc_TypeError,
6204 "sequence item %zd: expected str instance,"
6205 " %.80s found",
6206 i, Py_TYPE(item)->tp_name);
6207 goto onError;
6208 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006209 sz += PyUnicode_GET_SIZE(item);
6210 if (i != 0)
6211 sz += seplen;
6212 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6213 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006215 goto onError;
6216 }
6217 }
Tim Petersced69f82003-09-16 20:30:58 +00006218
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006219 res = _PyUnicode_New(sz);
6220 if (res == NULL)
6221 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006222
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006223 /* Catenate everything. */
6224 res_p = PyUnicode_AS_UNICODE(res);
6225 for (i = 0; i < seqlen; ++i) {
6226 Py_ssize_t itemlen;
6227 item = items[i];
6228 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 /* Copy item, and maybe the separator. */
6230 if (i) {
6231 Py_UNICODE_COPY(res_p, sep, seplen);
6232 res_p += seplen;
6233 }
6234 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6235 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006236 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006237
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006239 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 return (PyObject *)res;
6241
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006243 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006244 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 return NULL;
6246}
6247
Tim Petersced69f82003-09-16 20:30:58 +00006248static
6249PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 Py_ssize_t left,
6251 Py_ssize_t right,
6252 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253{
6254 PyUnicodeObject *u;
6255
6256 if (left < 0)
6257 left = 0;
6258 if (right < 0)
6259 right = 0;
6260
Tim Peters7a29bd52001-09-12 03:03:31 +00006261 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 Py_INCREF(self);
6263 return self;
6264 }
6265
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006266 if (left > PY_SSIZE_T_MAX - self->length ||
6267 right > PY_SSIZE_T_MAX - (left + self->length)) {
6268 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6269 return NULL;
6270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 u = _PyUnicode_New(left + self->length + right);
6272 if (u) {
6273 if (left)
6274 Py_UNICODE_FILL(u->str, fill, left);
6275 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6276 if (right)
6277 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6278 }
6279
6280 return u;
6281}
6282
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expansion relies on the enclosing function providing a
   `PyObject *str` scratch variable, a `list` object and an `onError`
   label, which is jumped to if creating or appending the slice fails.
   The temporary reference in `str` is always released.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and must be followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293
6294static
6295PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 PyObject *list,
6297 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006299 register Py_ssize_t i;
6300 register Py_ssize_t j;
6301 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006303 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304
6305 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006307 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006309 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6311 i++;
6312 if (j < i) {
6313 if (maxcount-- <= 0)
6314 break;
6315 SPLIT_APPEND(buf, j, i);
6316 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6317 i++;
6318 j = i;
6319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 }
6321 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 }
6324 return list;
6325
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 Py_DECREF(list);
6328 return NULL;
6329}
6330
6331PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006334 register Py_ssize_t i;
6335 register Py_ssize_t j;
6336 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 PyObject *list;
6338 PyObject *str;
6339 Py_UNICODE *data;
6340
6341 string = PyUnicode_FromObject(string);
6342 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 data = PyUnicode_AS_UNICODE(string);
6345 len = PyUnicode_GET_SIZE(string);
6346
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 list = PyList_New(0);
6348 if (!list)
6349 goto onError;
6350
6351 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006353
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 /* Find a line and append it */
6355 while (i < len && !BLOOM_LINEBREAK(data[i]))
6356 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006359 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 if (i < len) {
6361 if (data[i] == '\r' && i + 1 < len &&
6362 data[i+1] == '\n')
6363 i += 2;
6364 else
6365 i++;
6366 if (keepends)
6367 eol = i;
6368 }
6369 SPLIT_APPEND(data, j, eol);
6370 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 }
6372 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 }
6375
6376 Py_DECREF(string);
6377 return list;
6378
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006380 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 Py_DECREF(string);
6382 return NULL;
6383}
6384
Tim Petersced69f82003-09-16 20:30:58 +00006385static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 PyObject *list,
6388 Py_UNICODE ch,
6389 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006391 register Py_ssize_t i;
6392 register Py_ssize_t j;
6393 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006395 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396
6397 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 if (buf[i] == ch) {
6399 if (maxcount-- <= 0)
6400 break;
6401 SPLIT_APPEND(buf, j, i);
6402 i = j = i + 1;
6403 } else
6404 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 }
6406 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 }
6409 return list;
6410
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 Py_DECREF(list);
6413 return NULL;
6414}
6415
Tim Petersced69f82003-09-16 20:30:58 +00006416static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 PyObject *list,
6419 PyUnicodeObject *substring,
6420 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006422 register Py_ssize_t i;
6423 register Py_ssize_t j;
6424 Py_ssize_t len = self->length;
6425 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 PyObject *str;
6427
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006428 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 if (Py_UNICODE_MATCH(self, i, substring)) {
6430 if (maxcount-- <= 0)
6431 break;
6432 SPLIT_APPEND(self->str, j, i);
6433 i = j = i + sublen;
6434 } else
6435 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 }
6437 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 }
6440 return list;
6441
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 Py_DECREF(list);
6444 return NULL;
6445}
6446
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006447static
6448PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 PyObject *list,
6450 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006451{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006452 register Py_ssize_t i;
6453 register Py_ssize_t j;
6454 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006455 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006456 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006457
6458 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006460 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006462 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6464 i--;
6465 if (j > i) {
6466 if (maxcount-- <= 0)
6467 break;
6468 SPLIT_APPEND(buf, i + 1, j + 1);
6469 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6470 i--;
6471 j = i;
6472 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006473 }
6474 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006476 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 if (PyList_Reverse(list) < 0)
6478 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006479 return list;
6480
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006482 Py_DECREF(list);
6483 return NULL;
6484}
6485
Benjamin Peterson14339b62009-01-31 16:36:08 +00006486static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006487PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 PyObject *list,
6489 Py_UNICODE ch,
6490 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006491{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006492 register Py_ssize_t i;
6493 register Py_ssize_t j;
6494 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006495 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006496 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006497
6498 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 if (buf[i] == ch) {
6500 if (maxcount-- <= 0)
6501 break;
6502 SPLIT_APPEND(buf, i + 1, j + 1);
6503 j = i = i - 1;
6504 } else
6505 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006506 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006507 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006509 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006510 if (PyList_Reverse(list) < 0)
6511 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006512 return list;
6513
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006515 Py_DECREF(list);
6516 return NULL;
6517}
6518
Benjamin Peterson14339b62009-01-31 16:36:08 +00006519static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006520PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 PyObject *list,
6522 PyUnicodeObject *substring,
6523 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006524{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006525 register Py_ssize_t i;
6526 register Py_ssize_t j;
6527 Py_ssize_t len = self->length;
6528 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006529 PyObject *str;
6530
6531 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 if (Py_UNICODE_MATCH(self, i, substring)) {
6533 if (maxcount-- <= 0)
6534 break;
6535 SPLIT_APPEND(self->str, i + sublen, j);
6536 j = i;
6537 i -= sublen;
6538 } else
6539 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006540 }
6541 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006543 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006544 if (PyList_Reverse(list) < 0)
6545 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006546 return list;
6547
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006549 Py_DECREF(list);
6550 return NULL;
6551}
6552
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553#undef SPLIT_APPEND
6554
6555static
6556PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 PyUnicodeObject *substring,
6558 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559{
6560 PyObject *list;
6561
6562 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006563 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564
6565 list = PyList_New(0);
6566 if (!list)
6567 return NULL;
6568
6569 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571
6572 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
6575 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 Py_DECREF(list);
6577 PyErr_SetString(PyExc_ValueError, "empty separator");
6578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 }
6580 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582}
6583
Tim Petersced69f82003-09-16 20:30:58 +00006584static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006585PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 PyUnicodeObject *substring,
6587 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006588{
6589 PyObject *list;
6590
6591 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006592 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006593
6594 list = PyList_New(0);
6595 if (!list)
6596 return NULL;
6597
6598 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006600
6601 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006603
6604 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 Py_DECREF(list);
6606 PyErr_SetString(PyExc_ValueError, "empty separator");
6607 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006608 }
6609 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006611}
6612
6613static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 PyUnicodeObject *str1,
6616 PyUnicodeObject *str2,
6617 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618{
6619 PyUnicodeObject *u;
6620
6621 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623
Thomas Wouters477c8d52006-05-27 19:21:47 +00006624 if (str1->length == str2->length) {
6625 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006626 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006627 if (str1->length == 1) {
6628 /* replace characters */
6629 Py_UNICODE u1, u2;
6630 if (!findchar(self->str, self->length, str1->str[0]))
6631 goto nothing;
6632 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6633 if (!u)
6634 return NULL;
6635 Py_UNICODE_COPY(u->str, self->str, self->length);
6636 u1 = str1->str[0];
6637 u2 = str2->str[0];
6638 for (i = 0; i < u->length; i++)
6639 if (u->str[i] == u1) {
6640 if (--maxcount < 0)
6641 break;
6642 u->str[i] = u2;
6643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006645 i = fastsearch(
6646 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006648 if (i < 0)
6649 goto nothing;
6650 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6651 if (!u)
6652 return NULL;
6653 Py_UNICODE_COPY(u->str, self->str, self->length);
6654 while (i <= self->length - str1->length)
6655 if (Py_UNICODE_MATCH(self, i, str1)) {
6656 if (--maxcount < 0)
6657 break;
6658 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6659 i += str1->length;
6660 } else
6661 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006664
6665 Py_ssize_t n, i, j, e;
6666 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 Py_UNICODE *p;
6668
6669 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006670 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 if (n > maxcount)
6672 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006673 if (n == 0)
6674 goto nothing;
6675 /* new_size = self->length + n * (str2->length - str1->length)); */
6676 delta = (str2->length - str1->length);
6677 if (delta == 0) {
6678 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006680 product = n * (str2->length - str1->length);
6681 if ((product / (str2->length - str1->length)) != n) {
6682 PyErr_SetString(PyExc_OverflowError,
6683 "replace string is too long");
6684 return NULL;
6685 }
6686 new_size = self->length + product;
6687 if (new_size < 0) {
6688 PyErr_SetString(PyExc_OverflowError,
6689 "replace string is too long");
6690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 }
6692 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006693 u = _PyUnicode_New(new_size);
6694 if (!u)
6695 return NULL;
6696 i = 0;
6697 p = u->str;
6698 e = self->length - str1->length;
6699 if (str1->length > 0) {
6700 while (n-- > 0) {
6701 /* look for next match */
6702 j = i;
6703 while (j <= e) {
6704 if (Py_UNICODE_MATCH(self, j, str1))
6705 break;
6706 j++;
6707 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006709 if (j > e)
6710 break;
6711 /* copy unchanged part [i:j] */
6712 Py_UNICODE_COPY(p, self->str+i, j-i);
6713 p += j - i;
6714 }
6715 /* copy substitution string */
6716 if (str2->length > 0) {
6717 Py_UNICODE_COPY(p, str2->str, str2->length);
6718 p += str2->length;
6719 }
6720 i = j + str1->length;
6721 }
6722 if (i < self->length)
6723 /* copy tail [i:] */
6724 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6725 } else {
6726 /* interleave */
6727 while (n > 0) {
6728 Py_UNICODE_COPY(p, str2->str, str2->length);
6729 p += str2->length;
6730 if (--n <= 0)
6731 break;
6732 *p++ = self->str[i++];
6733 }
6734 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006738
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006740 /* nothing to replace; return original string (when possible) */
6741 if (PyUnicode_CheckExact(self)) {
6742 Py_INCREF(self);
6743 return (PyObject *) self;
6744 }
6745 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746}
6747
6748/* --- Unicode Object Methods --------------------------------------------- */
6749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006750PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752\n\
6753Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006754characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755
6756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006757unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 return fixup(self, fixtitle);
6760}
6761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006762PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764\n\
6765Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran74ceac22010-07-05 12:04:23 +00006766have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767
6768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006769unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 return fixup(self, fixcapitalize);
6772}
6773
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self into words, replaces every word by its fixcapitalize'd
   form and joins the list again.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *previous = PyList_GET_ITEM(words, pos);
        PyObject *capitalized = fixup((PyUnicodeObject *)previous,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* joined is still NULL */
        Py_DECREF(previous);
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
6811
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006812/* Argument converter. Coerces to a single unicode character */
6813
6814static int
6815convert_uc(PyObject *obj, void *addr)
6816{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006817 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6818 PyObject *uniobj;
6819 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006820
Benjamin Peterson14339b62009-01-31 16:36:08 +00006821 uniobj = PyUnicode_FromObject(obj);
6822 if (uniobj == NULL) {
6823 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006825 return 0;
6826 }
6827 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6828 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006830 Py_DECREF(uniobj);
6831 return 0;
6832 }
6833 unistr = PyUnicode_AS_UNICODE(uniobj);
6834 *fillcharloc = unistr[0];
6835 Py_DECREF(uniobj);
6836 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006837}
6838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006839PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006842Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006843done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
6845static PyObject *
6846unicode_center(PyUnicodeObject *self, PyObject *args)
6847{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006848 Py_ssize_t marg, left;
6849 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006850 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
Thomas Woutersde017742006-02-16 19:34:37 +00006852 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 return NULL;
6854
Tim Peters7a29bd52001-09-12 03:03:31 +00006855 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 Py_INCREF(self);
6857 return (PyObject*) self;
6858 }
6859
6860 marg = width - self->length;
6861 left = marg / 2 + (marg & width & 1);
6862
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006863 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864}
6865
Marc-André Lemburge5034372000-08-08 08:04:29 +00006866#if 0
6867
6868/* This code should go into some future Unicode collation support
6869 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006870 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006871
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006872/* speedy UTF-16 code point order comparison */
6873/* gleaned from: */
6874/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6875
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006876static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006877{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006878 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006879 0, 0, 0, 0, 0, 0, 0, 0,
6880 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006881 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006882};
6883
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884static int
6885unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6886{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006887 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 Py_UNICODE *s1 = str1->str;
6890 Py_UNICODE *s2 = str2->str;
6891
6892 len1 = str1->length;
6893 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006894
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006896 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006897
6898 c1 = *s1++;
6899 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006900
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 if (c1 > (1<<11) * 26)
6902 c1 += utf16Fixup[c1>>11];
6903 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006904 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006905 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006906
6907 if (c1 != c2)
6908 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006909
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006910 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 }
6912
6913 return (len1 < len2) ? -1 : (len1 != len2);
6914}
6915
Marc-André Lemburge5034372000-08-08 08:04:29 +00006916#else
6917
6918static int
6919unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6920{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006921 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006922
6923 Py_UNICODE *s1 = str1->str;
6924 Py_UNICODE *s2 = str2->str;
6925
6926 len1 = str1->length;
6927 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006928
Marc-André Lemburge5034372000-08-08 08:04:29 +00006929 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006930 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006931
Fredrik Lundh45714e92001-06-26 16:39:36 +00006932 c1 = *s1++;
6933 c2 = *s2++;
6934
6935 if (c1 != c2)
6936 return (c1 < c2) ? -1 : 1;
6937
Marc-André Lemburge5034372000-08-08 08:04:29 +00006938 len1--; len2--;
6939 }
6940
6941 return (len1 < len2) ? -1 : (len1 != len2);
6942}
6943
6944#endif
6945
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006949 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6950 return unicode_compare((PyUnicodeObject *)left,
6951 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006952 PyErr_Format(PyExc_TypeError,
6953 "Can't compare %.100s and %.100s",
6954 left->ob_type->tp_name,
6955 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 return -1;
6957}
6958
Martin v. Löwis5b222132007-06-10 09:51:05 +00006959int
6960PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6961{
6962 int i;
6963 Py_UNICODE *id;
6964 assert(PyUnicode_Check(uni));
6965 id = PyUnicode_AS_UNICODE(uni);
6966 /* Compare Unicode string and source character set string */
6967 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 if (id[i] != str[i])
6969 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Petersonbb81c8c2010-01-09 21:54:39 +00006970 /* This check keeps Python strings that end in '\0' from comparing equal
6971 to C strings identical up to that point. */
6972 if (PyUnicode_GET_SIZE(uni) != i)
6973 /* We'll say the Python string is longer. */
6974 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006975 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006977 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006979 return 0;
6980}
6981
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006982
Benjamin Peterson29060642009-01-31 22:14:21 +00006983#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006984 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006985
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006986PyObject *PyUnicode_RichCompare(PyObject *left,
6987 PyObject *right,
6988 int op)
6989{
6990 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006991
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006992 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6993 PyObject *v;
6994 if (((PyUnicodeObject *) left)->length !=
6995 ((PyUnicodeObject *) right)->length) {
6996 if (op == Py_EQ) {
6997 Py_INCREF(Py_False);
6998 return Py_False;
6999 }
7000 if (op == Py_NE) {
7001 Py_INCREF(Py_True);
7002 return Py_True;
7003 }
7004 }
7005 if (left == right)
7006 result = 0;
7007 else
7008 result = unicode_compare((PyUnicodeObject *)left,
7009 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007010
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007011 /* Convert the return value to a Boolean */
7012 switch (op) {
7013 case Py_EQ:
7014 v = TEST_COND(result == 0);
7015 break;
7016 case Py_NE:
7017 v = TEST_COND(result != 0);
7018 break;
7019 case Py_LE:
7020 v = TEST_COND(result <= 0);
7021 break;
7022 case Py_GE:
7023 v = TEST_COND(result >= 0);
7024 break;
7025 case Py_LT:
7026 v = TEST_COND(result == -1);
7027 break;
7028 case Py_GT:
7029 v = TEST_COND(result == 1);
7030 break;
7031 default:
7032 PyErr_BadArgument();
7033 return NULL;
7034 }
7035 Py_INCREF(v);
7036 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007038
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007039 Py_INCREF(Py_NotImplemented);
7040 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007041}
7042
Guido van Rossum403d68b2000-03-13 15:55:09 +00007043int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007045{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007046 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007047 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007048
7049 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007050 sub = PyUnicode_FromObject(element);
7051 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 PyErr_Format(PyExc_TypeError,
7053 "'in <string>' requires string as left operand, not %s",
7054 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007056 }
7057
Thomas Wouters477c8d52006-05-27 19:21:47 +00007058 str = PyUnicode_FromObject(container);
7059 if (!str) {
7060 Py_DECREF(sub);
7061 return -1;
7062 }
7063
7064 result = stringlib_contains_obj(str, sub);
7065
7066 Py_DECREF(str);
7067 Py_DECREF(sub);
7068
Guido van Rossum403d68b2000-03-13 15:55:09 +00007069 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007070}
7071
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072/* Concat to string or Unicode object giving a new Unicode object. */
7073
7074PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076{
7077 PyUnicodeObject *u = NULL, *v = NULL, *w;
7078
7079 /* Coerce the two arguments */
7080 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7081 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7084 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086
7087 /* Shortcuts */
7088 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 Py_DECREF(v);
7090 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091 }
7092 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 Py_DECREF(u);
7094 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 }
7096
7097 /* Concat the two Unicode strings */
7098 w = _PyUnicode_New(u->length + v->length);
7099 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101 Py_UNICODE_COPY(w->str, u->str, u->length);
7102 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7103
7104 Py_DECREF(u);
7105 Py_DECREF(v);
7106 return (PyObject *)w;
7107
Benjamin Peterson29060642009-01-31 22:14:21 +00007108 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 Py_XDECREF(u);
7110 Py_XDECREF(v);
7111 return NULL;
7112}
7113
Walter Dörwald1ab83302007-05-18 17:15:44 +00007114void
7115PyUnicode_Append(PyObject **pleft, PyObject *right)
7116{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007117 PyObject *new;
7118 if (*pleft == NULL)
7119 return;
7120 if (right == NULL || !PyUnicode_Check(*pleft)) {
7121 Py_DECREF(*pleft);
7122 *pleft = NULL;
7123 return;
7124 }
7125 new = PyUnicode_Concat(*pleft, right);
7126 Py_DECREF(*pleft);
7127 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007128}
7129
7130void
7131PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7132{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007133 PyUnicode_Append(pleft, right);
7134 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007135}
7136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007137PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007140Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007141string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007142interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143
7144static PyObject *
7145unicode_count(PyUnicodeObject *self, PyObject *args)
7146{
7147 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007148 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007149 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 PyObject *result;
7151
Guido van Rossumb8872e62000-05-09 14:14:27 +00007152 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 return NULL;
7155
7156 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007157 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007160
Thomas Wouters477c8d52006-05-27 19:21:47 +00007161 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162
Christian Heimes217cfd12007-12-02 14:31:20 +00007163 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007164 stringlib_count(self->str + start, end - start,
7165 substring->str, substring->length)
7166 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167
7168 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007169
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 return result;
7171}
7172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007173PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007176Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007177to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007178handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007179a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7180'xmlcharrefreplace' as well as any other name registered with\n\
7181codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182
7183static PyObject *
7184unicode_encode(PyUnicodeObject *self, PyObject *args)
7185{
7186 char *encoding = NULL;
7187 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007188 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007189
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7191 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007192 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007193 if (v == NULL)
7194 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007195 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007196 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007197 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007198 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007199 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007200 Py_DECREF(v);
7201 return NULL;
7202 }
7203 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007204
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007206 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007207}
7208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007209PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211\n\
7212Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007213If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214
7215static PyObject*
7216unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7217{
7218 Py_UNICODE *e;
7219 Py_UNICODE *p;
7220 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007221 Py_UNICODE *qe;
7222 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 PyUnicodeObject *u;
7224 int tabsize = 8;
7225
7226 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228
Thomas Wouters7e474022000-07-16 12:04:32 +00007229 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007230 i = 0; /* chars up to and including most recent \n or \r */
7231 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7232 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 for (p = self->str; p < e; p++)
7234 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 if (tabsize > 0) {
7236 incr = tabsize - (j % tabsize); /* cannot overflow */
7237 if (j > PY_SSIZE_T_MAX - incr)
7238 goto overflow1;
7239 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007240 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 if (j > PY_SSIZE_T_MAX - 1)
7244 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 j++;
7246 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 if (i > PY_SSIZE_T_MAX - j)
7248 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007250 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 }
7252 }
7253
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007254 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007256
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 /* Second pass: create output string and fill it */
7258 u = _PyUnicode_New(i + j);
7259 if (!u)
7260 return NULL;
7261
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007262 j = 0; /* same as in first pass */
7263 q = u->str; /* next output char */
7264 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
7266 for (p = self->str; p < e; p++)
7267 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 if (tabsize > 0) {
7269 i = tabsize - (j % tabsize);
7270 j += i;
7271 while (i--) {
7272 if (q >= qe)
7273 goto overflow2;
7274 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007275 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007276 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007277 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 else {
7279 if (q >= qe)
7280 goto overflow2;
7281 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007282 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 if (*p == '\n' || *p == '\r')
7284 j = 0;
7285 }
7286
7287 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007288
7289 overflow2:
7290 Py_DECREF(u);
7291 overflow1:
7292 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294}
7295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007296PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298\n\
7299Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007300such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301arguments start and end are interpreted as in slice notation.\n\
7302\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007303Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304
7305static PyObject *
7306unicode_find(PyUnicodeObject *self, PyObject *args)
7307{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007308 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007309 Py_ssize_t start;
7310 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007311 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
Christian Heimes9cd17752007-11-18 19:35:23 +00007313 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
Thomas Wouters477c8d52006-05-27 19:21:47 +00007316 result = stringlib_find_slice(
7317 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7318 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7319 start, end
7320 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321
7322 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007323
Christian Heimes217cfd12007-12-02 14:31:20 +00007324 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325}
7326
7327static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007328unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329{
7330 if (index < 0 || index >= self->length) {
7331 PyErr_SetString(PyExc_IndexError, "string index out of range");
7332 return NULL;
7333 }
7334
7335 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7336}
7337
Guido van Rossumc2504932007-09-18 19:42:40 +00007338/* Believe it or not, this produces the same value for ASCII strings
7339 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007341unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342{
Guido van Rossumc2504932007-09-18 19:42:40 +00007343 Py_ssize_t len;
7344 Py_UNICODE *p;
7345 long x;
7346
7347 if (self->hash != -1)
7348 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007349 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007350 p = self->str;
7351 x = *p << 7;
7352 while (--len >= 0)
7353 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007354 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007355 if (x == -1)
7356 x = -2;
7357 self->hash = x;
7358 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359}
7360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007361PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007364Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365
7366static PyObject *
7367unicode_index(PyUnicodeObject *self, PyObject *args)
7368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007369 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007370 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007371 Py_ssize_t start;
7372 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
Christian Heimes9cd17752007-11-18 19:35:23 +00007374 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376
Thomas Wouters477c8d52006-05-27 19:21:47 +00007377 result = stringlib_find_slice(
7378 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7379 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7380 start, end
7381 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382
7383 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007384
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 if (result < 0) {
7386 PyErr_SetString(PyExc_ValueError, "substring not found");
7387 return NULL;
7388 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007389
Christian Heimes217cfd12007-12-02 14:31:20 +00007390 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391}
7392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007393PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007396Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007397at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398
7399static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007400unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401{
7402 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7403 register const Py_UNICODE *e;
7404 int cased;
7405
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 /* Shortcut for single character strings */
7407 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007410 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007411 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007413
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414 e = p + PyUnicode_GET_SIZE(self);
7415 cased = 0;
7416 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007418
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7420 return PyBool_FromLong(0);
7421 else if (!cased && Py_UNICODE_ISLOWER(ch))
7422 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007424 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425}
7426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007427PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007430Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007431at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432
7433static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007434unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435{
7436 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7437 register const Py_UNICODE *e;
7438 int cased;
7439
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 /* Shortcut for single character strings */
7441 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007444 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007445 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007447
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 e = p + PyUnicode_GET_SIZE(self);
7449 cased = 0;
7450 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007452
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7454 return PyBool_FromLong(0);
7455 else if (!cased && Py_UNICODE_ISUPPER(ch))
7456 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007458 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459}
7460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007461PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007464Return True if S is a titlecased string and there is at least one\n\
7465character in S, i.e. upper- and titlecase characters may only\n\
7466follow uncased characters and lowercase characters only cased ones.\n\
7467Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468
7469static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007470unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471{
7472 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7473 register const Py_UNICODE *e;
7474 int cased, previous_is_cased;
7475
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 /* Shortcut for single character strings */
7477 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7479 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007481 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007482 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007484
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 e = p + PyUnicode_GET_SIZE(self);
7486 cased = 0;
7487 previous_is_cased = 0;
7488 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007490
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7492 if (previous_is_cased)
7493 return PyBool_FromLong(0);
7494 previous_is_cased = 1;
7495 cased = 1;
7496 }
7497 else if (Py_UNICODE_ISLOWER(ch)) {
7498 if (!previous_is_cased)
7499 return PyBool_FromLong(0);
7500 previous_is_cased = 1;
7501 cased = 1;
7502 }
7503 else
7504 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007506 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507}
7508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007509PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007512Return True if all characters in S are whitespace\n\
7513and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514
7515static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007516unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517{
7518 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7519 register const Py_UNICODE *e;
7520
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521 /* Shortcut for single character strings */
7522 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 Py_UNICODE_ISSPACE(*p))
7524 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007526 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007527 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007529
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 e = p + PyUnicode_GET_SIZE(self);
7531 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 if (!Py_UNICODE_ISSPACE(*p))
7533 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007535 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536}
7537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007538PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007540\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007541Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007542and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007543
7544static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007545unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007546{
7547 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7548 register const Py_UNICODE *e;
7549
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007550 /* Shortcut for single character strings */
7551 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 Py_UNICODE_ISALPHA(*p))
7553 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007554
7555 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007556 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007558
7559 e = p + PyUnicode_GET_SIZE(self);
7560 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 if (!Py_UNICODE_ISALPHA(*p))
7562 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007563 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007564 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007565}
7566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007567PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007569\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007570Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007572
7573static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007574unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007575{
7576 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7577 register const Py_UNICODE *e;
7578
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007579 /* Shortcut for single character strings */
7580 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 Py_UNICODE_ISALNUM(*p))
7582 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007583
7584 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007585 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007587
7588 e = p + PyUnicode_GET_SIZE(self);
7589 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 if (!Py_UNICODE_ISALNUM(*p))
7591 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007592 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007593 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007594}
7595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007596PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007599Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007600False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601
7602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007603unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604{
7605 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7606 register const Py_UNICODE *e;
7607
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 /* Shortcut for single character strings */
7609 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 Py_UNICODE_ISDECIMAL(*p))
7611 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007613 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007614 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007616
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617 e = p + PyUnicode_GET_SIZE(self);
7618 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 if (!Py_UNICODE_ISDECIMAL(*p))
7620 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007622 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623}
7624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007625PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007628Return True if all characters in S are digits\n\
7629and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630
7631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007632unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633{
7634 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7635 register const Py_UNICODE *e;
7636
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 /* Shortcut for single character strings */
7638 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 Py_UNICODE_ISDIGIT(*p))
7640 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007642 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007643 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007645
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 e = p + PyUnicode_GET_SIZE(self);
7647 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 if (!Py_UNICODE_ISDIGIT(*p))
7649 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007651 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652}
7653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007654PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007657Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007658False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659
7660static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007661unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662{
7663 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7664 register const Py_UNICODE *e;
7665
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 /* Shortcut for single character strings */
7667 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 Py_UNICODE_ISNUMERIC(*p))
7669 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007671 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007672 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007674
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675 e = p + PyUnicode_GET_SIZE(self);
7676 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 if (!Py_UNICODE_ISNUMERIC(*p))
7678 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007680 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681}
7682
Martin v. Löwis47383402007-08-15 07:32:56 +00007683int
7684PyUnicode_IsIdentifier(PyObject *self)
7685{
7686 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7687 register const Py_UNICODE *e;
7688
7689 /* Special case for empty strings */
7690 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007692
7693 /* PEP 3131 says that the first character must be in
7694 XID_Start and subsequent characters in XID_Continue,
7695 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007696 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007697 letters, digits, underscore). However, given the current
7698 definition of XID_Start and XID_Continue, it is sufficient
7699 to check just for these, except that _ must be allowed
7700 as starting an identifier. */
7701 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7702 return 0;
7703
7704 e = p + PyUnicode_GET_SIZE(self);
7705 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 if (!_PyUnicode_IsXidContinue(*p))
7707 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007708 }
7709 return 1;
7710}
7711
7712PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007714\n\
7715Return True if S is a valid identifier according\n\
7716to the language definition.");
7717
7718static PyObject*
7719unicode_isidentifier(PyObject *self)
7720{
7721 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7722}
7723
Georg Brandl559e5d72008-06-11 18:37:52 +00007724PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007726\n\
7727Return True if all characters in S are considered\n\
7728printable in repr() or S is empty, False otherwise.");
7729
7730static PyObject*
7731unicode_isprintable(PyObject *self)
7732{
7733 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7734 register const Py_UNICODE *e;
7735
7736 /* Shortcut for single character strings */
7737 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7738 Py_RETURN_TRUE;
7739 }
7740
7741 e = p + PyUnicode_GET_SIZE(self);
7742 for (; p < e; p++) {
7743 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7744 Py_RETURN_FALSE;
7745 }
7746 }
7747 Py_RETURN_TRUE;
7748}
7749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007750PyDoc_STRVAR(join__doc__,
Georg Brandl628e6f92009-10-27 20:24:45 +00007751 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752\n\
7753Return a string which is the concatenation of the strings in the\n\
Georg Brandl628e6f92009-10-27 20:24:45 +00007754iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007757unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007759 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760}
7761
Martin v. Löwis18e16552006-02-15 17:27:45 +00007762static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763unicode_length(PyUnicodeObject *self)
7764{
7765 return self->length;
7766}
7767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007768PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007771Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007772done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773
7774static PyObject *
7775unicode_ljust(PyUnicodeObject *self, PyObject *args)
7776{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007777 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007778 Py_UNICODE fillchar = ' ';
7779
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007780 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781 return NULL;
7782
Tim Peters7a29bd52001-09-12 03:03:31 +00007783 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 Py_INCREF(self);
7785 return (PyObject*) self;
7786 }
7787
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007788 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789}
7790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007791PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007794Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795
7796static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007797unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 return fixup(self, fixlower);
7800}
7801
/* Strip modes: which end(s) of the string the strip helpers below work on.
   The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Argument format strings passed to PyArg_ParseTuple() by do_argstrip(). */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7810
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007811/* externally visible for str.strip(unicode) */
7812PyObject *
7813_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7814{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007815 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7816 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7817 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7818 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7819 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007820
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007822
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 i = 0;
7824 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7826 i++;
7827 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007828 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007829
Benjamin Peterson14339b62009-01-31 16:36:08 +00007830 j = len;
7831 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 do {
7833 j--;
7834 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7835 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007836 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007837
Benjamin Peterson14339b62009-01-31 16:36:08 +00007838 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 Py_INCREF(self);
7840 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 }
7842 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007844}
7845
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846
7847static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007848do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007850 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7851 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007852
Benjamin Peterson14339b62009-01-31 16:36:08 +00007853 i = 0;
7854 if (striptype != RIGHTSTRIP) {
7855 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7856 i++;
7857 }
7858 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007859
Benjamin Peterson14339b62009-01-31 16:36:08 +00007860 j = len;
7861 if (striptype != LEFTSTRIP) {
7862 do {
7863 j--;
7864 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7865 j++;
7866 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007867
Benjamin Peterson14339b62009-01-31 16:36:08 +00007868 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7869 Py_INCREF(self);
7870 return (PyObject*)self;
7871 }
7872 else
7873 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874}
7875
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007876
7877static PyObject *
7878do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7879{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007880 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007881
Benjamin Peterson14339b62009-01-31 16:36:08 +00007882 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7883 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007884
Benjamin Peterson14339b62009-01-31 16:36:08 +00007885 if (sep != NULL && sep != Py_None) {
7886 if (PyUnicode_Check(sep))
7887 return _PyUnicode_XStrip(self, striptype, sep);
7888 else {
7889 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 "%s arg must be None or str",
7891 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007892 return NULL;
7893 }
7894 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007895
Benjamin Peterson14339b62009-01-31 16:36:08 +00007896 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007897}
7898
7899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007900PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007902\n\
7903Return a copy of the string S with leading and trailing\n\
7904whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007905If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007906
7907static PyObject *
7908unicode_strip(PyUnicodeObject *self, PyObject *args)
7909{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007910 if (PyTuple_GET_SIZE(args) == 0)
7911 return do_strip(self, BOTHSTRIP); /* Common case */
7912 else
7913 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007914}
7915
7916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007917PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007919\n\
7920Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007921If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007922
7923static PyObject *
7924unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7925{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007926 if (PyTuple_GET_SIZE(args) == 0)
7927 return do_strip(self, LEFTSTRIP); /* Common case */
7928 else
7929 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007930}
7931
7932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007933PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007935\n\
7936Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007937If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007938
7939static PyObject *
7940unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7941{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007942 if (PyTuple_GET_SIZE(args) == 0)
7943 return do_strip(self, RIGHTSTRIP); /* Common case */
7944 else
7945 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007946}
7947
7948
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007950unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951{
7952 PyUnicodeObject *u;
7953 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007954 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007955 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956
Georg Brandl222de0f2009-04-12 12:01:50 +00007957 if (len < 1) {
7958 Py_INCREF(unicode_empty);
7959 return (PyObject *)unicode_empty;
7960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961
Tim Peters7a29bd52001-09-12 03:03:31 +00007962 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 /* no repeat, return original string */
7964 Py_INCREF(str);
7965 return (PyObject*) str;
7966 }
Tim Peters8f422462000-09-09 06:13:41 +00007967
7968 /* ensure # of chars needed doesn't overflow int and # of bytes
7969 * needed doesn't overflow size_t
7970 */
7971 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007972 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007973 PyErr_SetString(PyExc_OverflowError,
7974 "repeated string is too long");
7975 return NULL;
7976 }
7977 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7978 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7979 PyErr_SetString(PyExc_OverflowError,
7980 "repeated string is too long");
7981 return NULL;
7982 }
7983 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 if (!u)
7985 return NULL;
7986
7987 p = u->str;
7988
Georg Brandl222de0f2009-04-12 12:01:50 +00007989 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007990 Py_UNICODE_FILL(p, str->str[0], len);
7991 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007992 Py_ssize_t done = str->length; /* number of characters copied this far */
7993 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007995 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007996 Py_UNICODE_COPY(p+done, p, n);
7997 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 }
8000
8001 return (PyObject*) u;
8002}
8003
8004PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 PyObject *subobj,
8006 PyObject *replobj,
8007 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008{
8009 PyObject *self;
8010 PyObject *str1;
8011 PyObject *str2;
8012 PyObject *result;
8013
8014 self = PyUnicode_FromObject(obj);
8015 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 str1 = PyUnicode_FromObject(subobj);
8018 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 Py_DECREF(self);
8020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 }
8022 str2 = PyUnicode_FromObject(replobj);
8023 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 Py_DECREF(self);
8025 Py_DECREF(str1);
8026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 }
Tim Petersced69f82003-09-16 20:30:58 +00008028 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 (PyUnicodeObject *)str1,
8030 (PyUnicodeObject *)str2,
8031 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 Py_DECREF(self);
8033 Py_DECREF(str1);
8034 Py_DECREF(str2);
8035 return result;
8036}
8037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008038PyDoc_STRVAR(replace__doc__,
Ezio Melotti415f3402010-06-26 18:52:26 +00008039 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040\n\
8041Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008042old replaced by new. If the optional argument count is\n\
8043given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044
8045static PyObject*
8046unicode_replace(PyUnicodeObject *self, PyObject *args)
8047{
8048 PyUnicodeObject *str1;
8049 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008050 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 PyObject *result;
8052
Martin v. Löwis18e16552006-02-15 17:27:45 +00008053 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 return NULL;
8055 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8056 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008059 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 Py_DECREF(str1);
8061 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063
8064 result = replace(self, str1, str2, maxcount);
8065
8066 Py_DECREF(str1);
8067 Py_DECREF(str2);
8068 return result;
8069}
8070
8071static
8072PyObject *unicode_repr(PyObject *unicode)
8073{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008074 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008075 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008076 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8077 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8078
8079 /* XXX(nnorwitz): rather than over-allocating, it would be
8080 better to choose a different scheme. Perhaps scan the
8081 first N-chars of the string and allocate based on that size.
8082 */
8083 /* Initial allocation is based on the longest-possible unichr
8084 escape.
8085
8086 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8087 unichr, so in this case it's the longest unichr escape. In
8088 narrow (UTF-16) builds this is five chars per source unichr
8089 since there are two unichrs in the surrogate pair, so in narrow
8090 (UTF-16) builds it's not the longest unichr escape.
8091
8092 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8093 so in the narrow (UTF-16) build case it's the longest unichr
8094 escape.
8095 */
8096
Walter Dörwald1ab83302007-05-18 17:15:44 +00008097 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008099#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008101#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008103#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008105 if (repr == NULL)
8106 return NULL;
8107
Walter Dörwald1ab83302007-05-18 17:15:44 +00008108 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008109
8110 /* Add quote */
8111 *p++ = (findchar(s, size, '\'') &&
8112 !findchar(s, size, '"')) ? '"' : '\'';
8113 while (size-- > 0) {
8114 Py_UNICODE ch = *s++;
8115
8116 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008117 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008118 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008119 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008120 continue;
8121 }
8122
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008124 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008125 *p++ = '\\';
8126 *p++ = 't';
8127 }
8128 else if (ch == '\n') {
8129 *p++ = '\\';
8130 *p++ = 'n';
8131 }
8132 else if (ch == '\r') {
8133 *p++ = '\\';
8134 *p++ = 'r';
8135 }
8136
8137 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008138 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008139 *p++ = '\\';
8140 *p++ = 'x';
8141 *p++ = hexdigits[(ch >> 4) & 0x000F];
8142 *p++ = hexdigits[ch & 0x000F];
8143 }
8144
Georg Brandl559e5d72008-06-11 18:37:52 +00008145 /* Copy ASCII characters as-is */
8146 else if (ch < 0x7F) {
8147 *p++ = ch;
8148 }
8149
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008151 else {
8152 Py_UCS4 ucs = ch;
8153
8154#ifndef Py_UNICODE_WIDE
8155 Py_UNICODE ch2 = 0;
8156 /* Get code point from surrogate pair */
8157 if (size > 0) {
8158 ch2 = *s;
8159 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008161 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008163 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008164 size--;
8165 }
8166 }
8167#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008169 (categories Z* and C* except ASCII space)
8170 */
8171 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8172 /* Map 8-bit characters to '\xhh' */
8173 if (ucs <= 0xff) {
8174 *p++ = '\\';
8175 *p++ = 'x';
8176 *p++ = hexdigits[(ch >> 4) & 0x000F];
8177 *p++ = hexdigits[ch & 0x000F];
8178 }
8179 /* Map 21-bit characters to '\U00xxxxxx' */
8180 else if (ucs >= 0x10000) {
8181 *p++ = '\\';
8182 *p++ = 'U';
8183 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8184 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8185 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8186 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8187 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8188 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8189 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8190 *p++ = hexdigits[ucs & 0x0000000F];
8191 }
8192 /* Map 16-bit characters to '\uxxxx' */
8193 else {
8194 *p++ = '\\';
8195 *p++ = 'u';
8196 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8197 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8198 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8199 *p++ = hexdigits[ucs & 0x000F];
8200 }
8201 }
8202 /* Copy characters as-is */
8203 else {
8204 *p++ = ch;
8205#ifndef Py_UNICODE_WIDE
8206 if (ucs >= 0x10000)
8207 *p++ = ch2;
8208#endif
8209 }
8210 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008211 }
8212 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008213 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008214
8215 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008216 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008217 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218}
8219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008220PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222\n\
8223Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008224such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225arguments start and end are interpreted as in slice notation.\n\
8226\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008227Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228
8229static PyObject *
8230unicode_rfind(PyUnicodeObject *self, PyObject *args)
8231{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008232 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008233 Py_ssize_t start;
8234 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008235 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236
Christian Heimes9cd17752007-11-18 19:35:23 +00008237 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239
Thomas Wouters477c8d52006-05-27 19:21:47 +00008240 result = stringlib_rfind_slice(
8241 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8242 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8243 start, end
8244 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245
8246 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008247
Christian Heimes217cfd12007-12-02 14:31:20 +00008248 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249}
8250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008251PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008254Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255
8256static PyObject *
8257unicode_rindex(PyUnicodeObject *self, PyObject *args)
8258{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008259 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008260 Py_ssize_t start;
8261 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008262 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263
Christian Heimes9cd17752007-11-18 19:35:23 +00008264 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266
Thomas Wouters477c8d52006-05-27 19:21:47 +00008267 result = stringlib_rfind_slice(
8268 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8269 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8270 start, end
8271 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272
8273 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008274
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 if (result < 0) {
8276 PyErr_SetString(PyExc_ValueError, "substring not found");
8277 return NULL;
8278 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008279 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280}
8281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008282PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008285Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008286done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287
8288static PyObject *
8289unicode_rjust(PyUnicodeObject *self, PyObject *args)
8290{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008291 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008292 Py_UNICODE fillchar = ' ';
8293
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008294 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 return NULL;
8296
Tim Peters7a29bd52001-09-12 03:03:31 +00008297 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 Py_INCREF(self);
8299 return (PyObject*) self;
8300 }
8301
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008302 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303}
8304
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 PyObject *sep,
8307 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308{
8309 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008310
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 s = PyUnicode_FromObject(s);
8312 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008313 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 if (sep != NULL) {
8315 sep = PyUnicode_FromObject(sep);
8316 if (sep == NULL) {
8317 Py_DECREF(s);
8318 return NULL;
8319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320 }
8321
8322 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8323
8324 Py_DECREF(s);
8325 Py_XDECREF(sep);
8326 return result;
8327}
8328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008329PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331\n\
8332Return a list of the words in S, using sep as the\n\
8333delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008334splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008335whitespace string is a separator and empty strings are\n\
8336removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337
8338static PyObject*
8339unicode_split(PyUnicodeObject *self, PyObject *args)
8340{
8341 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008342 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343
Martin v. Löwis18e16552006-02-15 17:27:45 +00008344 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 return NULL;
8346
8347 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353}
8354
Thomas Wouters477c8d52006-05-27 19:21:47 +00008355PyObject *
8356PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8357{
8358 PyObject* str_obj;
8359 PyObject* sep_obj;
8360 PyObject* out;
8361
8362 str_obj = PyUnicode_FromObject(str_in);
8363 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008365 sep_obj = PyUnicode_FromObject(sep_in);
8366 if (!sep_obj) {
8367 Py_DECREF(str_obj);
8368 return NULL;
8369 }
8370
8371 out = stringlib_partition(
8372 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8373 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8374 );
8375
8376 Py_DECREF(sep_obj);
8377 Py_DECREF(str_obj);
8378
8379 return out;
8380}
8381
8382
8383PyObject *
8384PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8385{
8386 PyObject* str_obj;
8387 PyObject* sep_obj;
8388 PyObject* out;
8389
8390 str_obj = PyUnicode_FromObject(str_in);
8391 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008393 sep_obj = PyUnicode_FromObject(sep_in);
8394 if (!sep_obj) {
8395 Py_DECREF(str_obj);
8396 return NULL;
8397 }
8398
8399 out = stringlib_rpartition(
8400 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8401 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8402 );
8403
8404 Py_DECREF(sep_obj);
8405 Py_DECREF(str_obj);
8406
8407 return out;
8408}
8409
8410PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008412\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008413Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008414the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008415found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008416
8417static PyObject*
8418unicode_partition(PyUnicodeObject *self, PyObject *separator)
8419{
8420 return PyUnicode_Partition((PyObject *)self, separator);
8421}
8422
8423PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti4c81fbb2010-01-25 12:02:24 +00008424 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008425\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008426Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008427the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008428separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008429
8430static PyObject*
8431unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8432{
8433 return PyUnicode_RPartition((PyObject *)self, separator);
8434}
8435
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008436PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 PyObject *sep,
8438 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008439{
8440 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008442 s = PyUnicode_FromObject(s);
8443 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 if (sep != NULL) {
8446 sep = PyUnicode_FromObject(sep);
8447 if (sep == NULL) {
8448 Py_DECREF(s);
8449 return NULL;
8450 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008451 }
8452
8453 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8454
8455 Py_DECREF(s);
8456 Py_XDECREF(sep);
8457 return result;
8458}
8459
8460PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008462\n\
8463Return a list of the words in S, using sep as the\n\
8464delimiter string, starting at the end of the string and\n\
8465working to the front. If maxsplit is given, at most maxsplit\n\
8466splits are done. If sep is not specified, any whitespace string\n\
8467is a separator.");
8468
8469static PyObject*
8470unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8471{
8472 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008473 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008474
Martin v. Löwis18e16552006-02-15 17:27:45 +00008475 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008476 return NULL;
8477
8478 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008480 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008482 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008484}
8485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008486PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488\n\
8489Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008490Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008491is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492
8493static PyObject*
8494unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8495{
Guido van Rossum86662912000-04-11 15:38:46 +00008496 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497
Guido van Rossum86662912000-04-11 15:38:46 +00008498 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499 return NULL;
8500
Guido van Rossum86662912000-04-11 15:38:46 +00008501 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502}
8503
8504static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008505PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506{
Walter Dörwald346737f2007-05-31 10:44:43 +00008507 if (PyUnicode_CheckExact(self)) {
8508 Py_INCREF(self);
8509 return self;
8510 } else
8511 /* Subtype -- return genuine unicode string with the same value. */
8512 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8513 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514}
8515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008516PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518\n\
8519Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008520and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521
8522static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008523unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525 return fixup(self, fixswapcase);
8526}
8527
Georg Brandlceee0772007-11-27 23:48:05 +00008528PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008530\n\
8531Return a translation table usable for str.translate().\n\
8532If there is only one argument, it must be a dictionary mapping Unicode\n\
8533ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008534Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008535If there are two arguments, they must be strings of equal length, and\n\
8536in the resulting dictionary, each character in x will be mapped to the\n\
8537character at the same position in y. If there is a third argument, it\n\
8538must be a string, whose characters will be mapped to None in the result.");
8539
8540static PyObject*
8541unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8542{
8543 PyObject *x, *y = NULL, *z = NULL;
8544 PyObject *new = NULL, *key, *value;
8545 Py_ssize_t i = 0;
8546 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008547
Georg Brandlceee0772007-11-27 23:48:05 +00008548 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8549 return NULL;
8550 new = PyDict_New();
8551 if (!new)
8552 return NULL;
8553 if (y != NULL) {
8554 /* x must be a string too, of equal length */
8555 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8556 if (!PyUnicode_Check(x)) {
8557 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8558 "be a string if there is a second argument");
8559 goto err;
8560 }
8561 if (PyUnicode_GET_SIZE(x) != ylen) {
8562 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8563 "arguments must have equal length");
8564 goto err;
8565 }
8566 /* create entries for translating chars in x to those in y */
8567 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008568 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8569 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008570 if (!key || !value)
8571 goto err;
8572 res = PyDict_SetItem(new, key, value);
8573 Py_DECREF(key);
8574 Py_DECREF(value);
8575 if (res < 0)
8576 goto err;
8577 }
8578 /* create entries for deleting chars in z */
8579 if (z != NULL) {
8580 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008581 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008582 if (!key)
8583 goto err;
8584 res = PyDict_SetItem(new, key, Py_None);
8585 Py_DECREF(key);
8586 if (res < 0)
8587 goto err;
8588 }
8589 }
8590 } else {
8591 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008592 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008593 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8594 "to maketrans it must be a dict");
8595 goto err;
8596 }
8597 /* copy entries into the new dict, converting string keys to int keys */
8598 while (PyDict_Next(x, &i, &key, &value)) {
8599 if (PyUnicode_Check(key)) {
8600 /* convert string keys to integer keys */
8601 PyObject *newkey;
8602 if (PyUnicode_GET_SIZE(key) != 1) {
8603 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8604 "table must be of length 1");
8605 goto err;
8606 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008607 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008608 if (!newkey)
8609 goto err;
8610 res = PyDict_SetItem(new, newkey, value);
8611 Py_DECREF(newkey);
8612 if (res < 0)
8613 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008614 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008615 /* just keep integer keys */
8616 if (PyDict_SetItem(new, key, value) < 0)
8617 goto err;
8618 } else {
8619 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8620 "be strings or integers");
8621 goto err;
8622 }
8623 }
8624 }
8625 return new;
8626 err:
8627 Py_DECREF(new);
8628 return NULL;
8629}
8630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008631PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633\n\
8634Return a copy of the string S, where all characters have been mapped\n\
8635through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008636Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008637Unmapped characters are left untouched. Characters mapped to None\n\
8638are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639
8640static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008641unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642{
Georg Brandlceee0772007-11-27 23:48:05 +00008643 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644}
8645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008646PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008649Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650
8651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008652unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 return fixup(self, fixupper);
8655}
8656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008657PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008660Pad a numeric string S with zeros on the left, to fill a field\n\
8661of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662
8663static PyObject *
8664unicode_zfill(PyUnicodeObject *self, PyObject *args)
8665{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008666 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667 PyUnicodeObject *u;
8668
Martin v. Löwis18e16552006-02-15 17:27:45 +00008669 Py_ssize_t width;
8670 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 return NULL;
8672
8673 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008674 if (PyUnicode_CheckExact(self)) {
8675 Py_INCREF(self);
8676 return (PyObject*) self;
8677 }
8678 else
8679 return PyUnicode_FromUnicode(
8680 PyUnicode_AS_UNICODE(self),
8681 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 }
8684
8685 fill = width - self->length;
8686
8687 u = pad(self, fill, 0, '0');
8688
Walter Dörwald068325e2002-04-15 13:36:47 +00008689 if (u == NULL)
8690 return NULL;
8691
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 if (u->str[fill] == '+' || u->str[fill] == '-') {
8693 /* move sign to beginning of string */
8694 u->str[0] = u->str[fill];
8695 u->str[fill] = '0';
8696 }
8697
8698 return (PyObject*) u;
8699}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700
#if 0
/* Compiled out: returns the value of `numfree` as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008709PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008712Return True if S starts with the specified prefix, False otherwise.\n\
8713With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008714With optional end, stop comparing S at that position.\n\
8715prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716
8717static PyObject *
8718unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008721 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008723 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008724 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008725 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008727 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8729 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008730 if (PyTuple_Check(subobj)) {
8731 Py_ssize_t i;
8732 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8733 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008735 if (substring == NULL)
8736 return NULL;
8737 result = tailmatch(self, substring, start, end, -1);
8738 Py_DECREF(substring);
8739 if (result) {
8740 Py_RETURN_TRUE;
8741 }
8742 }
8743 /* nothing matched */
8744 Py_RETURN_FALSE;
8745 }
8746 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008749 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008751 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752}
8753
8754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008755PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008758Return True if S ends with the specified suffix, False otherwise.\n\
8759With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008760With optional end, stop comparing S at that position.\n\
8761suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762
8763static PyObject *
8764unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008767 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008769 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008770 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008771 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008773 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8775 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008776 if (PyTuple_Check(subobj)) {
8777 Py_ssize_t i;
8778 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8779 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008781 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008783 result = tailmatch(self, substring, start, end, +1);
8784 Py_DECREF(substring);
8785 if (result) {
8786 Py_RETURN_TRUE;
8787 }
8788 }
8789 Py_RETURN_FALSE;
8790 }
8791 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008795 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008796 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008797 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798}
8799
Eric Smith8c663262007-08-25 02:26:07 +00008800#include "stringlib/string_format.h"
8801
8802PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008804\n\
8805");
8806
Eric Smith4a7d76d2008-05-30 18:10:19 +00008807static PyObject *
8808unicode__format__(PyObject* self, PyObject* args)
8809{
8810 PyObject *format_spec;
8811
8812 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8813 return NULL;
8814
8815 return _PyUnicode_FormatAdvanced(self,
8816 PyUnicode_AS_UNICODE(format_spec),
8817 PyUnicode_GET_SIZE(format_spec));
8818}
8819
Eric Smith8c663262007-08-25 02:26:07 +00008820PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008822\n\
8823");
8824
8825static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008826unicode__sizeof__(PyUnicodeObject *v)
8827{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008828 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8829 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008830}
8831
8832PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008834
8835static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008836unicode_getnewargs(PyUnicodeObject *v)
8837{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008838 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008839}
8840
8841
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842static PyMethodDef unicode_methods[] = {
8843
8844 /* Order is according to common usage: often used methods should
8845 appear first, since lookup is done sequentially. */
8846
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008847 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8848 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8849 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008850 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008851 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8852 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8853 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8854 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8855 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8856 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8857 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008858 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008859 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8860 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8861 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008862 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008863 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8864 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8865 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008866 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008867 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008868 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008869 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008870 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8871 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8872 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8873 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8874 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8875 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8876 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8877 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8878 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8879 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8880 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8881 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8882 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8883 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008884 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008885 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008886 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008887 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008888 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008889 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8890 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008891 {"maketrans", (PyCFunction) unicode_maketrans,
8892 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008893 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008894#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008895 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896#endif
8897
8898#if 0
8899 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008900 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901#endif
8902
Benjamin Peterson14339b62009-01-31 16:36:08 +00008903 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904 {NULL, NULL}
8905};
8906
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008907static PyObject *
8908unicode_mod(PyObject *v, PyObject *w)
8909{
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 if (!PyUnicode_Check(v)) {
8911 Py_INCREF(Py_NotImplemented);
8912 return Py_NotImplemented;
8913 }
8914 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008915}
8916
8917static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008918 0, /*nb_add*/
8919 0, /*nb_subtract*/
8920 0, /*nb_multiply*/
8921 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008922};
8923
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008925 (lenfunc) unicode_length, /* sq_length */
8926 PyUnicode_Concat, /* sq_concat */
8927 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8928 (ssizeargfunc) unicode_getitem, /* sq_item */
8929 0, /* sq_slice */
8930 0, /* sq_ass_item */
8931 0, /* sq_ass_slice */
8932 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933};
8934
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008935static PyObject*
8936unicode_subscript(PyUnicodeObject* self, PyObject* item)
8937{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008938 if (PyIndex_Check(item)) {
8939 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008940 if (i == -1 && PyErr_Occurred())
8941 return NULL;
8942 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008943 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008944 return unicode_getitem(self, i);
8945 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008946 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008947 Py_UNICODE* source_buf;
8948 Py_UNICODE* result_buf;
8949 PyObject* result;
8950
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008951 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008952 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008953 return NULL;
8954 }
8955
8956 if (slicelength <= 0) {
8957 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008958 } else if (start == 0 && step == 1 && slicelength == self->length &&
8959 PyUnicode_CheckExact(self)) {
8960 Py_INCREF(self);
8961 return (PyObject *)self;
8962 } else if (step == 1) {
8963 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008964 } else {
8965 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008966 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8967 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008968
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 if (result_buf == NULL)
8970 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008971
8972 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8973 result_buf[i] = source_buf[cur];
8974 }
Tim Petersced69f82003-09-16 20:30:58 +00008975
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008976 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008977 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008978 return result;
8979 }
8980 } else {
8981 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8982 return NULL;
8983 }
8984}
8985
8986static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008987 (lenfunc)unicode_length, /* mp_length */
8988 (binaryfunc)unicode_subscript, /* mp_subscript */
8989 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008990};
8991
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993/* Helpers for PyUnicode_Format() */
8994
8995static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008996getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008998 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 (*p_argidx)++;
9001 if (arglen < 0)
9002 return args;
9003 else
9004 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 }
9006 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 return NULL;
9009}
9010
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009011/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009013static PyObject *
9014formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009016 char *p;
9017 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009019
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 x = PyFloat_AsDouble(v);
9021 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009022 return NULL;
9023
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009026
Eric Smith0923d1d2009-04-16 20:16:10 +00009027 p = PyOS_double_to_string(x, type, prec,
9028 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009029 if (p == NULL)
9030 return NULL;
9031 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009032 PyMem_Free(p);
9033 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034}
9035
Tim Peters38fd5b62000-09-21 05:43:11 +00009036static PyObject*
9037formatlong(PyObject *val, int flags, int prec, int type)
9038{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009039 char *buf;
9040 int len;
9041 PyObject *str; /* temporary string object. */
9042 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009043
Benjamin Peterson14339b62009-01-31 16:36:08 +00009044 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9045 if (!str)
9046 return NULL;
9047 result = PyUnicode_FromStringAndSize(buf, len);
9048 Py_DECREF(str);
9049 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009050}
9051
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052static int
9053formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009054 size_t buflen,
9055 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009057 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009058 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009059 if (PyUnicode_GET_SIZE(v) == 1) {
9060 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9061 buf[1] = '\0';
9062 return 1;
9063 }
9064#ifndef Py_UNICODE_WIDE
9065 if (PyUnicode_GET_SIZE(v) == 2) {
9066 /* Decode a valid surrogate pair */
9067 int c0 = PyUnicode_AS_UNICODE(v)[0];
9068 int c1 = PyUnicode_AS_UNICODE(v)[1];
9069 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9070 0xDC00 <= c1 && c1 <= 0xDFFF) {
9071 buf[0] = c0;
9072 buf[1] = c1;
9073 buf[2] = '\0';
9074 return 2;
9075 }
9076 }
9077#endif
9078 goto onError;
9079 }
9080 else {
9081 /* Integer input truncated to a character */
9082 long x;
9083 x = PyLong_AsLong(v);
9084 if (x == -1 && PyErr_Occurred())
9085 goto onError;
9086
9087 if (x < 0 || x > 0x10ffff) {
9088 PyErr_SetString(PyExc_OverflowError,
9089 "%c arg not in range(0x110000)");
9090 return -1;
9091 }
9092
9093#ifndef Py_UNICODE_WIDE
9094 if (x > 0xffff) {
9095 x -= 0x10000;
9096 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9097 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9098 return 2;
9099 }
9100#endif
9101 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009102 buf[1] = '\0';
9103 return 1;
9104 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009105
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009107 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009109 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110}
9111
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009112/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009113 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009114*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009115#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009116
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119{
9120 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009121 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 int args_owned = 0;
9123 PyUnicodeObject *result = NULL;
9124 PyObject *dict = NULL;
9125 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009126
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 PyErr_BadInternalCall();
9129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 }
9131 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009132 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 fmt = PyUnicode_AS_UNICODE(uformat);
9135 fmtcnt = PyUnicode_GET_SIZE(uformat);
9136
9137 reslen = rescnt = fmtcnt + 100;
9138 result = _PyUnicode_New(reslen);
9139 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 res = PyUnicode_AS_UNICODE(result);
9142
9143 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 arglen = PyTuple_Size(args);
9145 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 }
9147 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009148 arglen = -1;
9149 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009151 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009152 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154
9155 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 if (*fmt != '%') {
9157 if (--rescnt < 0) {
9158 rescnt = fmtcnt + 100;
9159 reslen += rescnt;
9160 if (_PyUnicode_Resize(&result, reslen) < 0)
9161 goto onError;
9162 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9163 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009164 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009166 }
9167 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 /* Got a format specifier */
9169 int flags = 0;
9170 Py_ssize_t width = -1;
9171 int prec = -1;
9172 Py_UNICODE c = '\0';
9173 Py_UNICODE fill;
9174 int isnumok;
9175 PyObject *v = NULL;
9176 PyObject *temp = NULL;
9177 Py_UNICODE *pbuf;
9178 Py_UNICODE sign;
9179 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009180 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 fmt++;
9183 if (*fmt == '(') {
9184 Py_UNICODE *keystart;
9185 Py_ssize_t keylen;
9186 PyObject *key;
9187 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009188
Benjamin Peterson29060642009-01-31 22:14:21 +00009189 if (dict == NULL) {
9190 PyErr_SetString(PyExc_TypeError,
9191 "format requires a mapping");
9192 goto onError;
9193 }
9194 ++fmt;
9195 --fmtcnt;
9196 keystart = fmt;
9197 /* Skip over balanced parentheses */
9198 while (pcount > 0 && --fmtcnt >= 0) {
9199 if (*fmt == ')')
9200 --pcount;
9201 else if (*fmt == '(')
9202 ++pcount;
9203 fmt++;
9204 }
9205 keylen = fmt - keystart - 1;
9206 if (fmtcnt < 0 || pcount > 0) {
9207 PyErr_SetString(PyExc_ValueError,
9208 "incomplete format key");
9209 goto onError;
9210 }
9211#if 0
9212 /* keys are converted to strings using UTF-8 and
9213 then looked up since Python uses strings to hold
9214 variables names etc. in its namespaces and we
9215 wouldn't want to break common idioms. */
9216 key = PyUnicode_EncodeUTF8(keystart,
9217 keylen,
9218 NULL);
9219#else
9220 key = PyUnicode_FromUnicode(keystart, keylen);
9221#endif
9222 if (key == NULL)
9223 goto onError;
9224 if (args_owned) {
9225 Py_DECREF(args);
9226 args_owned = 0;
9227 }
9228 args = PyObject_GetItem(dict, key);
9229 Py_DECREF(key);
9230 if (args == NULL) {
9231 goto onError;
9232 }
9233 args_owned = 1;
9234 arglen = -1;
9235 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009236 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009237 while (--fmtcnt >= 0) {
9238 switch (c = *fmt++) {
9239 case '-': flags |= F_LJUST; continue;
9240 case '+': flags |= F_SIGN; continue;
9241 case ' ': flags |= F_BLANK; continue;
9242 case '#': flags |= F_ALT; continue;
9243 case '0': flags |= F_ZERO; continue;
9244 }
9245 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009246 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009247 if (c == '*') {
9248 v = getnextarg(args, arglen, &argidx);
9249 if (v == NULL)
9250 goto onError;
9251 if (!PyLong_Check(v)) {
9252 PyErr_SetString(PyExc_TypeError,
9253 "* wants int");
9254 goto onError;
9255 }
9256 width = PyLong_AsLong(v);
9257 if (width == -1 && PyErr_Occurred())
9258 goto onError;
9259 if (width < 0) {
9260 flags |= F_LJUST;
9261 width = -width;
9262 }
9263 if (--fmtcnt >= 0)
9264 c = *fmt++;
9265 }
9266 else if (c >= '0' && c <= '9') {
9267 width = c - '0';
9268 while (--fmtcnt >= 0) {
9269 c = *fmt++;
9270 if (c < '0' || c > '9')
9271 break;
9272 if ((width*10) / 10 != width) {
9273 PyErr_SetString(PyExc_ValueError,
9274 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009275 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009276 }
9277 width = width*10 + (c - '0');
9278 }
9279 }
9280 if (c == '.') {
9281 prec = 0;
9282 if (--fmtcnt >= 0)
9283 c = *fmt++;
9284 if (c == '*') {
9285 v = getnextarg(args, arglen, &argidx);
9286 if (v == NULL)
9287 goto onError;
9288 if (!PyLong_Check(v)) {
9289 PyErr_SetString(PyExc_TypeError,
9290 "* wants int");
9291 goto onError;
9292 }
9293 prec = PyLong_AsLong(v);
9294 if (prec == -1 && PyErr_Occurred())
9295 goto onError;
9296 if (prec < 0)
9297 prec = 0;
9298 if (--fmtcnt >= 0)
9299 c = *fmt++;
9300 }
9301 else if (c >= '0' && c <= '9') {
9302 prec = c - '0';
9303 while (--fmtcnt >= 0) {
Stefan Krahaebd6f42010-07-19 18:01:13 +00009304 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 if (c < '0' || c > '9')
9306 break;
9307 if ((prec*10) / 10 != prec) {
9308 PyErr_SetString(PyExc_ValueError,
9309 "prec too big");
9310 goto onError;
9311 }
9312 prec = prec*10 + (c - '0');
9313 }
9314 }
9315 } /* prec */
9316 if (fmtcnt >= 0) {
9317 if (c == 'h' || c == 'l' || c == 'L') {
9318 if (--fmtcnt >= 0)
9319 c = *fmt++;
9320 }
9321 }
9322 if (fmtcnt < 0) {
9323 PyErr_SetString(PyExc_ValueError,
9324 "incomplete format");
9325 goto onError;
9326 }
9327 if (c != '%') {
9328 v = getnextarg(args, arglen, &argidx);
9329 if (v == NULL)
9330 goto onError;
9331 }
9332 sign = 0;
9333 fill = ' ';
9334 switch (c) {
9335
9336 case '%':
9337 pbuf = formatbuf;
9338 /* presume that buffer length is at least 1 */
9339 pbuf[0] = '%';
9340 len = 1;
9341 break;
9342
9343 case 's':
9344 case 'r':
9345 case 'a':
Victor Stinnerabdb21a2010-03-22 12:53:14 +00009346 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 temp = v;
9348 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009349 }
9350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 if (c == 's')
9352 temp = PyObject_Str(v);
9353 else if (c == 'r')
9354 temp = PyObject_Repr(v);
9355 else
9356 temp = PyObject_ASCII(v);
9357 if (temp == NULL)
9358 goto onError;
9359 if (PyUnicode_Check(temp))
9360 /* nothing to do */;
9361 else {
9362 Py_DECREF(temp);
9363 PyErr_SetString(PyExc_TypeError,
9364 "%s argument has non-string str()");
9365 goto onError;
9366 }
9367 }
9368 pbuf = PyUnicode_AS_UNICODE(temp);
9369 len = PyUnicode_GET_SIZE(temp);
9370 if (prec >= 0 && len > prec)
9371 len = prec;
9372 break;
9373
9374 case 'i':
9375 case 'd':
9376 case 'u':
9377 case 'o':
9378 case 'x':
9379 case 'X':
9380 if (c == 'i')
9381 c = 'd';
9382 isnumok = 0;
9383 if (PyNumber_Check(v)) {
9384 PyObject *iobj=NULL;
9385
9386 if (PyLong_Check(v)) {
9387 iobj = v;
9388 Py_INCREF(iobj);
9389 }
9390 else {
9391 iobj = PyNumber_Long(v);
9392 }
9393 if (iobj!=NULL) {
9394 if (PyLong_Check(iobj)) {
9395 isnumok = 1;
9396 temp = formatlong(iobj, flags, prec, c);
9397 Py_DECREF(iobj);
9398 if (!temp)
9399 goto onError;
9400 pbuf = PyUnicode_AS_UNICODE(temp);
9401 len = PyUnicode_GET_SIZE(temp);
9402 sign = 1;
9403 }
9404 else {
9405 Py_DECREF(iobj);
9406 }
9407 }
9408 }
9409 if (!isnumok) {
9410 PyErr_Format(PyExc_TypeError,
9411 "%%%c format: a number is required, "
9412 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9413 goto onError;
9414 }
9415 if (flags & F_ZERO)
9416 fill = '0';
9417 break;
9418
9419 case 'e':
9420 case 'E':
9421 case 'f':
9422 case 'F':
9423 case 'g':
9424 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009425 temp = formatfloat(v, flags, prec, c);
9426 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009428 pbuf = PyUnicode_AS_UNICODE(temp);
9429 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009430 sign = 1;
9431 if (flags & F_ZERO)
9432 fill = '0';
9433 break;
9434
9435 case 'c':
9436 pbuf = formatbuf;
9437 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9438 if (len < 0)
9439 goto onError;
9440 break;
9441
9442 default:
9443 PyErr_Format(PyExc_ValueError,
9444 "unsupported format character '%c' (0x%x) "
9445 "at index %zd",
9446 (31<=c && c<=126) ? (char)c : '?',
9447 (int)c,
9448 (Py_ssize_t)(fmt - 1 -
9449 PyUnicode_AS_UNICODE(uformat)));
9450 goto onError;
9451 }
9452 if (sign) {
9453 if (*pbuf == '-' || *pbuf == '+') {
9454 sign = *pbuf++;
9455 len--;
9456 }
9457 else if (flags & F_SIGN)
9458 sign = '+';
9459 else if (flags & F_BLANK)
9460 sign = ' ';
9461 else
9462 sign = 0;
9463 }
9464 if (width < len)
9465 width = len;
9466 if (rescnt - (sign != 0) < width) {
9467 reslen -= rescnt;
9468 rescnt = width + fmtcnt + 100;
9469 reslen += rescnt;
9470 if (reslen < 0) {
9471 Py_XDECREF(temp);
9472 PyErr_NoMemory();
9473 goto onError;
9474 }
9475 if (_PyUnicode_Resize(&result, reslen) < 0) {
9476 Py_XDECREF(temp);
9477 goto onError;
9478 }
9479 res = PyUnicode_AS_UNICODE(result)
9480 + reslen - rescnt;
9481 }
9482 if (sign) {
9483 if (fill != ' ')
9484 *res++ = sign;
9485 rescnt--;
9486 if (width > len)
9487 width--;
9488 }
9489 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9490 assert(pbuf[0] == '0');
9491 assert(pbuf[1] == c);
9492 if (fill != ' ') {
9493 *res++ = *pbuf++;
9494 *res++ = *pbuf++;
9495 }
9496 rescnt -= 2;
9497 width -= 2;
9498 if (width < 0)
9499 width = 0;
9500 len -= 2;
9501 }
9502 if (width > len && !(flags & F_LJUST)) {
9503 do {
9504 --rescnt;
9505 *res++ = fill;
9506 } while (--width > len);
9507 }
9508 if (fill == ' ') {
9509 if (sign)
9510 *res++ = sign;
9511 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9512 assert(pbuf[0] == '0');
9513 assert(pbuf[1] == c);
9514 *res++ = *pbuf++;
9515 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009516 }
9517 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 Py_UNICODE_COPY(res, pbuf, len);
9519 res += len;
9520 rescnt -= len;
9521 while (--width >= len) {
9522 --rescnt;
9523 *res++ = ' ';
9524 }
9525 if (dict && (argidx < arglen) && c != '%') {
9526 PyErr_SetString(PyExc_TypeError,
9527 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009528 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 goto onError;
9530 }
9531 Py_XDECREF(temp);
9532 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 } /* until end */
9534 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 PyErr_SetString(PyExc_TypeError,
9536 "not all arguments converted during string formatting");
9537 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538 }
9539
Thomas Woutersa96affe2006-03-12 00:29:36 +00009540 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544 }
9545 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546 return (PyObject *)result;
9547
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549 Py_XDECREF(result);
9550 Py_DECREF(uformat);
9551 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009552 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553 }
9554 return NULL;
9555}
9556
Jeremy Hylton938ace62002-07-17 16:30:39 +00009557static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009558unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9559
Tim Peters6d6c1a32001-08-02 04:15:00 +00009560static PyObject *
9561unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9562{
Benjamin Peterson29060642009-01-31 22:14:21 +00009563 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009564 static char *kwlist[] = {"object", "encoding", "errors", 0};
9565 char *encoding = NULL;
9566 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009567
Benjamin Peterson14339b62009-01-31 16:36:08 +00009568 if (type != &PyUnicode_Type)
9569 return unicode_subtype_new(type, args, kwds);
9570 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009572 return NULL;
9573 if (x == NULL)
9574 return (PyObject *)_PyUnicode_New(0);
9575 if (encoding == NULL && errors == NULL)
9576 return PyObject_Str(x);
9577 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009579}
9580
Guido van Rossume023fe02001-08-30 03:12:59 +00009581static PyObject *
9582unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9583{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009584 PyUnicodeObject *tmp, *pnew;
9585 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009586
Benjamin Peterson14339b62009-01-31 16:36:08 +00009587 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9588 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9589 if (tmp == NULL)
9590 return NULL;
9591 assert(PyUnicode_Check(tmp));
9592 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9593 if (pnew == NULL) {
9594 Py_DECREF(tmp);
9595 return NULL;
9596 }
9597 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9598 if (pnew->str == NULL) {
9599 _Py_ForgetReference((PyObject *)pnew);
9600 PyObject_Del(pnew);
9601 Py_DECREF(tmp);
9602 return PyErr_NoMemory();
9603 }
9604 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9605 pnew->length = n;
9606 pnew->hash = tmp->hash;
9607 Py_DECREF(tmp);
9608 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009609}
9610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009611PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009613\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009614Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009615encoding defaults to the current default string encoding.\n\
9616errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009617
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009618static PyObject *unicode_iter(PyObject *seq);
9619
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009621 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009622 "str", /* tp_name */
9623 sizeof(PyUnicodeObject), /* tp_size */
9624 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009626 (destructor)unicode_dealloc, /* tp_dealloc */
9627 0, /* tp_print */
9628 0, /* tp_getattr */
9629 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009630 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009631 unicode_repr, /* tp_repr */
9632 &unicode_as_number, /* tp_as_number */
9633 &unicode_as_sequence, /* tp_as_sequence */
9634 &unicode_as_mapping, /* tp_as_mapping */
9635 (hashfunc) unicode_hash, /* tp_hash*/
9636 0, /* tp_call*/
9637 (reprfunc) unicode_str, /* tp_str */
9638 PyObject_GenericGetAttr, /* tp_getattro */
9639 0, /* tp_setattro */
9640 0, /* tp_as_buffer */
9641 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009642 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009643 unicode_doc, /* tp_doc */
9644 0, /* tp_traverse */
9645 0, /* tp_clear */
9646 PyUnicode_RichCompare, /* tp_richcompare */
9647 0, /* tp_weaklistoffset */
9648 unicode_iter, /* tp_iter */
9649 0, /* tp_iternext */
9650 unicode_methods, /* tp_methods */
9651 0, /* tp_members */
9652 0, /* tp_getset */
9653 &PyBaseObject_Type, /* tp_base */
9654 0, /* tp_dict */
9655 0, /* tp_descr_get */
9656 0, /* tp_descr_set */
9657 0, /* tp_dictoffset */
9658 0, /* tp_init */
9659 0, /* tp_alloc */
9660 unicode_new, /* tp_new */
9661 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662};
9663
9664/* Initialize the Unicode implementation */
9665
Thomas Wouters78890102000-07-22 19:25:51 +00009666void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009668 int i;
9669
Thomas Wouters477c8d52006-05-27 19:21:47 +00009670 /* XXX - move this array to unicodectype.c ? */
9671 Py_UNICODE linebreak[] = {
9672 0x000A, /* LINE FEED */
9673 0x000D, /* CARRIAGE RETURN */
9674 0x001C, /* FILE SEPARATOR */
9675 0x001D, /* GROUP SEPARATOR */
9676 0x001E, /* RECORD SEPARATOR */
9677 0x0085, /* NEXT LINE */
9678 0x2028, /* LINE SEPARATOR */
9679 0x2029, /* PARAGRAPH SEPARATOR */
9680 };
9681
Fred Drakee4315f52000-05-09 19:53:39 +00009682 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009683 free_list = NULL;
9684 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009686 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009687 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009688
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009689 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009690 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009691 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009692 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009693
9694 /* initialize the linebreak bloom filter */
9695 bloom_linebreak = make_bloom_mask(
9696 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9697 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009698
9699 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700}
9701
9702/* Finalize the Unicode implementation */
9703
Christian Heimesa156e092008-02-16 07:38:31 +00009704int
9705PyUnicode_ClearFreeList(void)
9706{
9707 int freelist_size = numfree;
9708 PyUnicodeObject *u;
9709
9710 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 PyUnicodeObject *v = u;
9712 u = *(PyUnicodeObject **)u;
9713 if (v->str)
9714 PyObject_DEL(v->str);
9715 Py_XDECREF(v->defenc);
9716 PyObject_Del(v);
9717 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009718 }
9719 free_list = NULL;
9720 assert(numfree == 0);
9721 return freelist_size;
9722}
9723
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724void
Thomas Wouters78890102000-07-22 19:25:51 +00009725_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009727 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009729 Py_XDECREF(unicode_empty);
9730 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009731
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009732 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009733 if (unicode_latin1[i]) {
9734 Py_DECREF(unicode_latin1[i]);
9735 unicode_latin1[i] = NULL;
9736 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009737 }
Christian Heimesa156e092008-02-16 07:38:31 +00009738 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009739}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009740
Walter Dörwald16807132007-05-25 13:52:07 +00009741void
9742PyUnicode_InternInPlace(PyObject **p)
9743{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009744 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9745 PyObject *t;
9746 if (s == NULL || !PyUnicode_Check(s))
9747 Py_FatalError(
9748 "PyUnicode_InternInPlace: unicode strings only please!");
9749 /* If it's a subclass, we don't really know what putting
9750 it in the interned dict might do. */
9751 if (!PyUnicode_CheckExact(s))
9752 return;
9753 if (PyUnicode_CHECK_INTERNED(s))
9754 return;
9755 if (interned == NULL) {
9756 interned = PyDict_New();
9757 if (interned == NULL) {
9758 PyErr_Clear(); /* Don't leave an exception */
9759 return;
9760 }
9761 }
9762 /* It might be that the GetItem call fails even
9763 though the key is present in the dictionary,
9764 namely when this happens during a stack overflow. */
9765 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009766 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009767 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009768
Benjamin Peterson29060642009-01-31 22:14:21 +00009769 if (t) {
9770 Py_INCREF(t);
9771 Py_DECREF(*p);
9772 *p = t;
9773 return;
9774 }
Walter Dörwald16807132007-05-25 13:52:07 +00009775
Benjamin Peterson14339b62009-01-31 16:36:08 +00009776 PyThreadState_GET()->recursion_critical = 1;
9777 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9778 PyErr_Clear();
9779 PyThreadState_GET()->recursion_critical = 0;
9780 return;
9781 }
9782 PyThreadState_GET()->recursion_critical = 0;
9783 /* The two references in interned are not counted by refcnt.
9784 The deallocator will take care of this */
9785 Py_REFCNT(s) -= 2;
9786 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009787}
9788
9789void
9790PyUnicode_InternImmortal(PyObject **p)
9791{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009792 PyUnicode_InternInPlace(p);
9793 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9794 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9795 Py_INCREF(*p);
9796 }
Walter Dörwald16807132007-05-25 13:52:07 +00009797}
9798
9799PyObject *
9800PyUnicode_InternFromString(const char *cp)
9801{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009802 PyObject *s = PyUnicode_FromString(cp);
9803 if (s == NULL)
9804 return NULL;
9805 PyUnicode_InternInPlace(&s);
9806 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009807}
9808
9809void _Py_ReleaseInternedUnicodeStrings(void)
9810{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009811 PyObject *keys;
9812 PyUnicodeObject *s;
9813 Py_ssize_t i, n;
9814 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009815
Benjamin Peterson14339b62009-01-31 16:36:08 +00009816 if (interned == NULL || !PyDict_Check(interned))
9817 return;
9818 keys = PyDict_Keys(interned);
9819 if (keys == NULL || !PyList_Check(keys)) {
9820 PyErr_Clear();
9821 return;
9822 }
Walter Dörwald16807132007-05-25 13:52:07 +00009823
Benjamin Peterson14339b62009-01-31 16:36:08 +00009824 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9825 detector, interned unicode strings are not forcibly deallocated;
9826 rather, we give them their stolen references back, and then clear
9827 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009828
Benjamin Peterson14339b62009-01-31 16:36:08 +00009829 n = PyList_GET_SIZE(keys);
9830 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009831 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009832 for (i = 0; i < n; i++) {
9833 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9834 switch (s->state) {
9835 case SSTATE_NOT_INTERNED:
9836 /* XXX Shouldn't happen */
9837 break;
9838 case SSTATE_INTERNED_IMMORTAL:
9839 Py_REFCNT(s) += 1;
9840 immortal_size += s->length;
9841 break;
9842 case SSTATE_INTERNED_MORTAL:
9843 Py_REFCNT(s) += 2;
9844 mortal_size += s->length;
9845 break;
9846 default:
9847 Py_FatalError("Inconsistent interned string state.");
9848 }
9849 s->state = SSTATE_NOT_INTERNED;
9850 }
9851 fprintf(stderr, "total size of all interned strings: "
9852 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9853 "mortal/immortal\n", mortal_size, immortal_size);
9854 Py_DECREF(keys);
9855 PyDict_Clear(interned);
9856 Py_DECREF(interned);
9857 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009858}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009859
9860
9861/********************* Unicode Iterator **************************/
9862
9863typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009864 PyObject_HEAD
9865 Py_ssize_t it_index;
9866 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009867} unicodeiterobject;
9868
9869static void
9870unicodeiter_dealloc(unicodeiterobject *it)
9871{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009872 _PyObject_GC_UNTRACK(it);
9873 Py_XDECREF(it->it_seq);
9874 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009875}
9876
9877static int
9878unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9879{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009880 Py_VISIT(it->it_seq);
9881 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009882}
9883
9884static PyObject *
9885unicodeiter_next(unicodeiterobject *it)
9886{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009887 PyUnicodeObject *seq;
9888 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009889
Benjamin Peterson14339b62009-01-31 16:36:08 +00009890 assert(it != NULL);
9891 seq = it->it_seq;
9892 if (seq == NULL)
9893 return NULL;
9894 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009895
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9897 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009898 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009899 if (item != NULL)
9900 ++it->it_index;
9901 return item;
9902 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009903
Benjamin Peterson14339b62009-01-31 16:36:08 +00009904 Py_DECREF(seq);
9905 it->it_seq = NULL;
9906 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009907}
9908
9909static PyObject *
9910unicodeiter_len(unicodeiterobject *it)
9911{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009912 Py_ssize_t len = 0;
9913 if (it->it_seq)
9914 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9915 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009916}
9917
9918PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9919
9920static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009921 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009922 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009923 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009924};
9925
9926PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009927 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9928 "str_iterator", /* tp_name */
9929 sizeof(unicodeiterobject), /* tp_basicsize */
9930 0, /* tp_itemsize */
9931 /* methods */
9932 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9933 0, /* tp_print */
9934 0, /* tp_getattr */
9935 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009936 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009937 0, /* tp_repr */
9938 0, /* tp_as_number */
9939 0, /* tp_as_sequence */
9940 0, /* tp_as_mapping */
9941 0, /* tp_hash */
9942 0, /* tp_call */
9943 0, /* tp_str */
9944 PyObject_GenericGetAttr, /* tp_getattro */
9945 0, /* tp_setattro */
9946 0, /* tp_as_buffer */
9947 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9948 0, /* tp_doc */
9949 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9950 0, /* tp_clear */
9951 0, /* tp_richcompare */
9952 0, /* tp_weaklistoffset */
9953 PyObject_SelfIter, /* tp_iter */
9954 (iternextfunc)unicodeiter_next, /* tp_iternext */
9955 unicodeiter_methods, /* tp_methods */
9956 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009957};
9958
9959static PyObject *
9960unicode_iter(PyObject *seq)
9961{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009962 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009963
Benjamin Peterson14339b62009-01-31 16:36:08 +00009964 if (!PyUnicode_Check(seq)) {
9965 PyErr_BadInternalCall();
9966 return NULL;
9967 }
9968 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9969 if (it == NULL)
9970 return NULL;
9971 it->it_index = 0;
9972 Py_INCREF(seq);
9973 it->it_seq = (PyUnicodeObject *)seq;
9974 _PyObject_GC_TRACK(it);
9975 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009976}
9977
Martin v. Löwis5b222132007-06-10 09:51:05 +00009978size_t
9979Py_UNICODE_strlen(const Py_UNICODE *u)
9980{
9981 int res = 0;
9982 while(*u++)
9983 res++;
9984 return res;
9985}
9986
9987Py_UNICODE*
9988Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9989{
9990 Py_UNICODE *u = s1;
9991 while ((*u++ = *s2++));
9992 return s1;
9993}
9994
9995Py_UNICODE*
9996Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9997{
9998 Py_UNICODE *u = s1;
9999 while ((*u++ = *s2++))
10000 if (n-- == 0)
10001 break;
10002 return s1;
10003}
10004
10005int
10006Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10007{
10008 while (*s1 && *s2 && *s1 == *s2)
10009 s1++, s2++;
10010 if (*s1 && *s2)
10011 return (*s1 < *s2) ? -1 : +1;
10012 if (*s1)
10013 return 1;
10014 if (*s2)
10015 return -1;
10016 return 0;
10017}
10018
10019Py_UNICODE*
10020Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10021{
10022 const Py_UNICODE *p;
10023 for (p = s; *p; p++)
10024 if (*p == c)
10025 return (Py_UNICODE*)p;
10026 return NULL;
10027}
10028
10029
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010030#ifdef __cplusplus
10031}
10032#endif
10033
10034
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010035/*
Benjamin Peterson29060642009-01-31 22:14:21 +000010036 Local variables:
10037 c-basic-offset: 4
10038 indent-tabs-mode: nil
10039 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010040*/