blob: cc0bbec1f141b7f978b5b39ba2b87cc3a8d4cdf7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "keep alive" optimization.

   An object put on the free list keeps its character buffer when its
   length is below this limit, which saves a malloc() for small
   Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the optimization off.

   Note: This is an experimental feature!  If Unicode objects cause
   core dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
/* Byte order selection: little endian unless the build configuration
   defines WORDS_BIGENDIAN. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c is 1 when the 7-bit character c is whitespace.
   Each row below covers 16 consecutive characters. */
const unsigned char _Py_ascii_whitespace[] = {
/* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
              0x0B VERTICAL TABULATION, 0x0C FORM FEED,
              0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
/* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
              0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
/* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
Victor Stinner158701d2010-04-22 19:41:01 +0000162static void raise_encode_exception(PyObject **exceptionObject,
163 const char *encoding,
164 const Py_UNICODE *unicode, Py_ssize_t size,
165 Py_ssize_t startpos, Py_ssize_t endpos,
166 const char *reason);
167
/* Same kind of lookup table for linebreaks: entry c is 1 when the 7-bit
   character c is a line break.  Each row covers 16 characters. */
static unsigned char ascii_linebreak[] = {
/* 0x00-0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 0, 0, 1, 0, 0,
/* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
              0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
/* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
193
194
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000195Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000196PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000198#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000199 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000200#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000201 /* This is actually an illegal character, so it should
202 not be passed to unichr. */
203 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000204#endif
205}
206
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used, with the 5 least
   significant bits of each Unicode character selecting the bit. */

/* The linebreak mask is set up by the Unicode init function below. */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by ch is set in mask.

   The shifted constant is an unsigned long: the bit index can be as
   large as 31, and shifting a signed int 1 that far is undefined
   behaviour.  mask is parenthesized so that a compound argument such
   as (a | b) is tested as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* True when ch is a line break: 7-bit characters are looked up in the
   ascii_linebreak table, all others must pass the bloom mask and then
   the exact Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
225Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
226{
227 /* calculate simple bloom-style bitmask for a given unicode string */
228
229 long mask;
230 Py_ssize_t i;
231
232 mask = 0;
233 for (i = 0; i < len; i++)
234 mask |= (1 << (ptr[i] & 0x1F));
235
236 return mask;
237}
238
239Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
240{
241 Py_ssize_t i;
242
243 for (i = 0; i < setlen; i++)
244 if (set[i] == chr)
245 return 1;
246
247 return 0;
248}
249
/* True when chr is a member of set: the bloom mask is a cheap first
   test and unicode_member() confirms a hit.  The whole expansion (and
   each argument) is parenthesized so the macro behaves as a single
   operand, e.g. under `!` or next to `||`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
252
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253/* --- Unicode Object ----------------------------------------------------- */
254
255static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000257 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258{
259 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000260
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000261 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000263 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 /* Resizing shared object (unicode_empty or single character
266 objects) in-place is not allowed. Use PyUnicode_Resize()
267 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000268
Benjamin Peterson14339b62009-01-31 16:36:08 +0000269 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000270 (unicode->length == 1 &&
271 unicode->str[0] < 256U &&
272 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000274 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 return -1;
276 }
277
Thomas Wouters477c8d52006-05-27 19:21:47 +0000278 /* We allocate one more byte to make sure the string is Ux0000 terminated.
279 The overallocation is also used by fastsearch, which assumes that it's
280 safe to look at str[length] (without making any assumptions about what
281 it contains). */
282
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000284 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000285 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000287 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 PyErr_NoMemory();
289 return -1;
290 }
291 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000292 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293
Benjamin Peterson29060642009-01-31 22:14:21 +0000294 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000296 if (unicode->defenc) {
297 Py_DECREF(unicode->defenc);
298 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
305/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000306 Ux0000 terminated; some code (e.g. new_identifier)
307 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308
309 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000310 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
417 Py_DECREF(unicode->defenc);
418 unicode->defenc = NULL;
419 }
420 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000421 *(PyUnicodeObject **)unicode = free_list;
422 free_list = unicode;
423 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000424 }
425 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000426 PyObject_DEL(unicode->str);
427 Py_XDECREF(unicode->defenc);
428 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429 }
430}
431
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000432static
433int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000434{
435 register PyUnicodeObject *v;
436
437 /* Argument checks */
438 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyErr_BadInternalCall();
440 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000441 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000442 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000443 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000444 PyErr_BadInternalCall();
445 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446 }
447
448 /* Resizing unicode_empty and single character objects is not
449 possible since these are being shared. We simply return a fresh
450 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000451 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000452 (v == unicode_empty || v->length == 1)) {
453 PyUnicodeObject *w = _PyUnicode_New(length);
454 if (w == NULL)
455 return -1;
456 Py_UNICODE_COPY(w->str, v->str,
457 length < v->length ? length : v->length);
458 Py_DECREF(*unicode);
459 *unicode = w;
460 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000461 }
462
463 /* Note that we don't have to modify *unicode for unshared Unicode
464 objects, since we can modify them in-place. */
465 return unicode_resize(v, length);
466}
467
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000468int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
469{
470 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
471}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000472
Guido van Rossumd57fd912000-03-10 22:53:23 +0000473PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000474 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475{
476 PyUnicodeObject *unicode;
477
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000478 /* If the Unicode data is known at construction time, we can apply
479 some optimizations which share commonly used objects. */
480 if (u != NULL) {
481
Benjamin Peterson29060642009-01-31 22:14:21 +0000482 /* Optimization for empty strings */
483 if (size == 0 && unicode_empty != NULL) {
484 Py_INCREF(unicode_empty);
485 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000486 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000487
488 /* Single character Unicode objects in the Latin-1 range are
489 shared when using this constructor */
490 if (size == 1 && *u < 256) {
491 unicode = unicode_latin1[*u];
492 if (!unicode) {
493 unicode = _PyUnicode_New(1);
494 if (!unicode)
495 return NULL;
496 unicode->str[0] = *u;
497 unicode_latin1[*u] = unicode;
498 }
499 Py_INCREF(unicode);
500 return (PyObject *)unicode;
501 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000502 }
Tim Petersced69f82003-09-16 20:30:58 +0000503
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 unicode = _PyUnicode_New(size);
505 if (!unicode)
506 return NULL;
507
508 /* Copy the Unicode data into the new object */
509 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000510 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000511
512 return (PyObject *)unicode;
513}
514
Walter Dörwaldd2034312007-05-18 16:29:38 +0000515PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000516{
517 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000518
Benjamin Peterson14339b62009-01-31 16:36:08 +0000519 if (size < 0) {
520 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000521 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000522 return NULL;
523 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000524
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000525 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000526 some optimizations which share commonly used objects.
527 Also, this means the input must be UTF-8, so fall back to the
528 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000529 if (u != NULL) {
530
Benjamin Peterson29060642009-01-31 22:14:21 +0000531 /* Optimization for empty strings */
532 if (size == 0 && unicode_empty != NULL) {
533 Py_INCREF(unicode_empty);
534 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000535 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000536
537 /* Single characters are shared when using this constructor.
538 Restrict to ASCII, since the input must be UTF-8. */
539 if (size == 1 && Py_CHARMASK(*u) < 128) {
540 unicode = unicode_latin1[Py_CHARMASK(*u)];
541 if (!unicode) {
542 unicode = _PyUnicode_New(1);
543 if (!unicode)
544 return NULL;
545 unicode->str[0] = Py_CHARMASK(*u);
546 unicode_latin1[Py_CHARMASK(*u)] = unicode;
547 }
548 Py_INCREF(unicode);
549 return (PyObject *)unicode;
550 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000551
552 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000553 }
554
Walter Dörwald55507312007-05-18 13:12:10 +0000555 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000556 if (!unicode)
557 return NULL;
558
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000559 return (PyObject *)unicode;
560}
561
Walter Dörwaldd2034312007-05-18 16:29:38 +0000562PyObject *PyUnicode_FromString(const char *u)
563{
564 size_t size = strlen(u);
565 if (size > PY_SSIZE_T_MAX) {
566 PyErr_SetString(PyExc_OverflowError, "input too long");
567 return NULL;
568 }
569
570 return PyUnicode_FromStringAndSize(u, size);
571}
572
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573#ifdef HAVE_WCHAR_H
574
Mark Dickinson081dfee2009-03-18 14:47:41 +0000575#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
576# define CONVERT_WCHAR_TO_SURROGATES
577#endif
578
579#ifdef CONVERT_WCHAR_TO_SURROGATES
580
581/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
582 to convert from UTF32 to UTF16. */
583
584PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
585 Py_ssize_t size)
586{
587 PyUnicodeObject *unicode;
588 register Py_ssize_t i;
589 Py_ssize_t alloc;
590 const wchar_t *orig_w;
591
592 if (w == NULL) {
593 if (size == 0)
594 return PyUnicode_FromStringAndSize(NULL, 0);
595 PyErr_BadInternalCall();
596 return NULL;
597 }
598
599 if (size == -1) {
600 size = wcslen(w);
601 }
602
603 alloc = size;
604 orig_w = w;
605 for (i = size; i > 0; i--) {
606 if (*w > 0xFFFF)
607 alloc++;
608 w++;
609 }
610 w = orig_w;
611 unicode = _PyUnicode_New(alloc);
612 if (!unicode)
613 return NULL;
614
615 /* Copy the wchar_t data into the new object */
616 {
617 register Py_UNICODE *u;
618 u = PyUnicode_AS_UNICODE(unicode);
619 for (i = size; i > 0; i--) {
620 if (*w > 0xFFFF) {
621 wchar_t ordinal = *w++;
622 ordinal -= 0x10000;
623 *u++ = 0xD800 | (ordinal >> 10);
624 *u++ = 0xDC00 | (ordinal & 0x3FF);
625 }
626 else
627 *u++ = *w++;
628 }
629 }
630 return (PyObject *)unicode;
631}
632
633#else
634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000636 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637{
638 PyUnicodeObject *unicode;
639
640 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000641 if (size == 0)
642 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000643 PyErr_BadInternalCall();
644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645 }
646
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == -1) {
648 size = wcslen(w);
649 }
650
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 unicode = _PyUnicode_New(size);
652 if (!unicode)
653 return NULL;
654
655 /* Copy the wchar_t data into the new object */
656#ifdef HAVE_USABLE_WCHAR_T
657 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000658#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000660 register Py_UNICODE *u;
661 register Py_ssize_t i;
662 u = PyUnicode_AS_UNICODE(unicode);
663 for (i = size; i > 0; i--)
664 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 }
666#endif
667
668 return (PyObject *)unicode;
669}
670
Mark Dickinson081dfee2009-03-18 14:47:41 +0000671#endif /* CONVERT_WCHAR_TO_SURROGATES */
672
673#undef CONVERT_WCHAR_TO_SURROGATES
674
Walter Dörwald346737f2007-05-31 10:44:43 +0000675static void
676makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
688 else if (size_tflag) {
689 char *f = PY_FORMAT_SIZE_T;
690 while (*f)
691 *fmt++ = *f++;
692 }
693 *fmt++ = c;
694 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000695}
696
/* Append the NUL-terminated sequence `string` at the output cursor,
   advancing it.  Relies on the variables `copy` (scratch read pointer)
   and `s` (output cursor) of the scope in which it is expanded. */
#define appendstring(string)                    \
    {                                           \
        copy = string;                          \
        while (*copy)                           \
            *s++ = *copy++;                     \
    }
698
699PyObject *
700PyUnicode_FromFormatV(const char *format, va_list vargs)
701{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000702 va_list count;
703 Py_ssize_t callcount = 0;
704 PyObject **callresults = NULL;
705 PyObject **callresult = NULL;
706 Py_ssize_t n = 0;
707 int width = 0;
708 int precision = 0;
709 int zeropad;
710 const char* f;
711 Py_UNICODE *s;
712 PyObject *string;
713 /* used by sprintf */
714 char buffer[21];
715 /* use abuffer instead of buffer, if we need more space
716 * (which can happen if there's a format specifier with width). */
717 char *abuffer = NULL;
718 char *realbuffer;
719 Py_ssize_t abuffersize = 0;
720 char fmt[60]; /* should be enough for %0width.precisionld */
721 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722
723#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000724 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725#else
726#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000727 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000728#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000729 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000730#endif
731#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000732 /* step 1: count the number of %S/%R/%A/%s format specifications
733 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
734 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
735 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000736 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000737 if (*f == '%') {
738 if (*(f+1)=='%')
739 continue;
740 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
741 ++callcount;
742 while (ISDIGIT((unsigned)*f))
743 width = (width*10) + *f++ - '0';
744 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
745 ;
746 if (*f == 's')
747 ++callcount;
748 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000749 }
750 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000751 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000752 if (callcount) {
753 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
754 if (!callresults) {
755 PyErr_NoMemory();
756 return NULL;
757 }
758 callresult = callresults;
759 }
760 /* step 3: figure out how large a buffer we need */
761 for (f = format; *f; f++) {
762 if (*f == '%') {
763 const char* p = f;
764 width = 0;
765 while (ISDIGIT((unsigned)*f))
766 width = (width*10) + *f++ - '0';
767 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
768 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000769
Benjamin Peterson14339b62009-01-31 16:36:08 +0000770 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
771 * they don't affect the amount of space we reserve.
772 */
773 if ((*f == 'l' || *f == 'z') &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000774 (f[1] == 'd' || f[1] == 'u'))
775 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000776
Benjamin Peterson14339b62009-01-31 16:36:08 +0000777 switch (*f) {
778 case 'c':
779 (void)va_arg(count, int);
780 /* fall through... */
781 case '%':
782 n++;
783 break;
784 case 'd': case 'u': case 'i': case 'x':
785 (void) va_arg(count, int);
786 /* 20 bytes is enough to hold a 64-bit
787 integer. Decimal takes the most space.
788 This isn't enough for octal.
789 If a width is specified we need more
790 (which we allocate later). */
791 if (width < 20)
792 width = 20;
793 n += width;
794 if (abuffersize < width)
795 abuffersize = width;
796 break;
797 case 's':
798 {
799 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000800 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000801 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
802 if (!str)
803 goto fail;
804 n += PyUnicode_GET_SIZE(str);
805 /* Remember the str and switch to the next slot */
806 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000807 break;
808 }
809 case 'U':
810 {
811 PyObject *obj = va_arg(count, PyObject *);
812 assert(obj && PyUnicode_Check(obj));
813 n += PyUnicode_GET_SIZE(obj);
814 break;
815 }
816 case 'V':
817 {
818 PyObject *obj = va_arg(count, PyObject *);
819 const char *str = va_arg(count, const char *);
820 assert(obj || str);
821 assert(!obj || PyUnicode_Check(obj));
822 if (obj)
823 n += PyUnicode_GET_SIZE(obj);
824 else
825 n += strlen(str);
826 break;
827 }
828 case 'S':
829 {
830 PyObject *obj = va_arg(count, PyObject *);
831 PyObject *str;
832 assert(obj);
833 str = PyObject_Str(obj);
834 if (!str)
835 goto fail;
836 n += PyUnicode_GET_SIZE(str);
837 /* Remember the str and switch to the next slot */
838 *callresult++ = str;
839 break;
840 }
841 case 'R':
842 {
843 PyObject *obj = va_arg(count, PyObject *);
844 PyObject *repr;
845 assert(obj);
846 repr = PyObject_Repr(obj);
847 if (!repr)
848 goto fail;
849 n += PyUnicode_GET_SIZE(repr);
850 /* Remember the repr and switch to the next slot */
851 *callresult++ = repr;
852 break;
853 }
854 case 'A':
855 {
856 PyObject *obj = va_arg(count, PyObject *);
857 PyObject *ascii;
858 assert(obj);
859 ascii = PyObject_ASCII(obj);
860 if (!ascii)
861 goto fail;
862 n += PyUnicode_GET_SIZE(ascii);
863 /* Remember the repr and switch to the next slot */
864 *callresult++ = ascii;
865 break;
866 }
867 case 'p':
868 (void) va_arg(count, int);
869 /* maximum 64-bit pointer representation:
870 * 0xffffffffffffffff
871 * so 19 characters is enough.
872 * XXX I count 18 -- what's the extra for?
873 */
874 n += 19;
875 break;
876 default:
877 /* if we stumble upon an unknown
878 formatting code, copy the rest of
879 the format string to the output
880 string. (we cannot just skip the
881 code, since there's no way to know
882 what's in the argument list) */
883 n += strlen(p);
884 goto expand;
885 }
886 } else
887 n++;
888 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000889 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +0000890 if (abuffersize > 20) {
891 abuffer = PyObject_Malloc(abuffersize);
892 if (!abuffer) {
893 PyErr_NoMemory();
894 goto fail;
895 }
896 realbuffer = abuffer;
897 }
898 else
899 realbuffer = buffer;
900 /* step 4: fill the buffer */
901 /* Since we've analyzed how much space we need for the worst case,
902 we don't have to resize the string.
903 There can be no errors beyond this point. */
904 string = PyUnicode_FromUnicode(NULL, n);
905 if (!string)
906 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000907
Benjamin Peterson14339b62009-01-31 16:36:08 +0000908 s = PyUnicode_AS_UNICODE(string);
909 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000910
Benjamin Peterson14339b62009-01-31 16:36:08 +0000911 for (f = format; *f; f++) {
912 if (*f == '%') {
913 const char* p = f++;
914 int longflag = 0;
915 int size_tflag = 0;
916 zeropad = (*f == '0');
917 /* parse the width.precision part */
918 width = 0;
919 while (ISDIGIT((unsigned)*f))
920 width = (width*10) + *f++ - '0';
921 precision = 0;
922 if (*f == '.') {
923 f++;
924 while (ISDIGIT((unsigned)*f))
925 precision = (precision*10) + *f++ - '0';
926 }
927 /* handle the long flag, but only for %ld and %lu.
928 others can be added when necessary. */
929 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
930 longflag = 1;
931 ++f;
932 }
933 /* handle the size_t flag. */
934 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
935 size_tflag = 1;
936 ++f;
937 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000938
Benjamin Peterson14339b62009-01-31 16:36:08 +0000939 switch (*f) {
940 case 'c':
941 *s++ = va_arg(vargs, int);
942 break;
943 case 'd':
944 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
945 if (longflag)
946 sprintf(realbuffer, fmt, va_arg(vargs, long));
947 else if (size_tflag)
948 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
949 else
950 sprintf(realbuffer, fmt, va_arg(vargs, int));
951 appendstring(realbuffer);
952 break;
953 case 'u':
954 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
955 if (longflag)
956 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
957 else if (size_tflag)
958 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
959 else
960 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
961 appendstring(realbuffer);
962 break;
963 case 'i':
964 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
965 sprintf(realbuffer, fmt, va_arg(vargs, int));
966 appendstring(realbuffer);
967 break;
968 case 'x':
969 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
970 sprintf(realbuffer, fmt, va_arg(vargs, int));
971 appendstring(realbuffer);
972 break;
973 case 's':
974 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000975 /* unused, since we already have the result */
976 (void) va_arg(vargs, char *);
977 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
978 PyUnicode_GET_SIZE(*callresult));
979 s += PyUnicode_GET_SIZE(*callresult);
980 /* We're done with the unicode()/repr() => forget it */
981 Py_DECREF(*callresult);
982 /* switch to next unicode()/repr() result */
983 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000984 break;
985 }
986 case 'U':
987 {
988 PyObject *obj = va_arg(vargs, PyObject *);
989 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
990 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
991 s += size;
992 break;
993 }
994 case 'V':
995 {
996 PyObject *obj = va_arg(vargs, PyObject *);
997 const char *str = va_arg(vargs, const char *);
998 if (obj) {
999 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1000 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1001 s += size;
1002 } else {
1003 appendstring(str);
1004 }
1005 break;
1006 }
1007 case 'S':
1008 case 'R':
1009 {
1010 Py_UNICODE *ucopy;
1011 Py_ssize_t usize;
1012 Py_ssize_t upos;
1013 /* unused, since we already have the result */
1014 (void) va_arg(vargs, PyObject *);
1015 ucopy = PyUnicode_AS_UNICODE(*callresult);
1016 usize = PyUnicode_GET_SIZE(*callresult);
1017 for (upos = 0; upos<usize;)
1018 *s++ = ucopy[upos++];
1019 /* We're done with the unicode()/repr() => forget it */
1020 Py_DECREF(*callresult);
1021 /* switch to next unicode()/repr() result */
1022 ++callresult;
1023 break;
1024 }
1025 case 'p':
1026 sprintf(buffer, "%p", va_arg(vargs, void*));
1027 /* %p is ill-defined: ensure leading 0x. */
1028 if (buffer[1] == 'X')
1029 buffer[1] = 'x';
1030 else if (buffer[1] != 'x') {
1031 memmove(buffer+2, buffer, strlen(buffer)+1);
1032 buffer[0] = '0';
1033 buffer[1] = 'x';
1034 }
1035 appendstring(buffer);
1036 break;
1037 case '%':
1038 *s++ = '%';
1039 break;
1040 default:
1041 appendstring(p);
1042 goto end;
1043 }
1044 } else
1045 *s++ = *f;
1046 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001047
Benjamin Peterson29060642009-01-31 22:14:21 +00001048 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001049 if (callresults)
1050 PyObject_Free(callresults);
1051 if (abuffer)
1052 PyObject_Free(abuffer);
1053 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1054 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001055 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001056 if (callresults) {
1057 PyObject **callresult2 = callresults;
1058 while (callresult2 < callresult) {
1059 Py_DECREF(*callresult2);
1060 ++callresult2;
1061 }
1062 PyObject_Free(callresults);
1063 }
1064 if (abuffer)
1065 PyObject_Free(abuffer);
1066 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001067}
1068
1069#undef appendstring
1070
1071PyObject *
1072PyUnicode_FromFormat(const char *format, ...)
1073{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001074 PyObject* ret;
1075 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001076
1077#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001079#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001080 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001081#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001082 ret = PyUnicode_FromFormatV(format, vargs);
1083 va_end(vargs);
1084 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001085}
1086
Martin v. Löwis18e16552006-02-15 17:27:45 +00001087Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001088 wchar_t *w,
1089 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090{
1091 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001092 PyErr_BadInternalCall();
1093 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001094 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001095
1096 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001098 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001099
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100#ifdef HAVE_USABLE_WCHAR_T
1101 memcpy(w, unicode->str, size * sizeof(wchar_t));
1102#else
1103 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001104 register Py_UNICODE *u;
1105 register Py_ssize_t i;
1106 u = PyUnicode_AS_UNICODE(unicode);
1107 for (i = size; i > 0; i--)
1108 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001109 }
1110#endif
1111
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001112 if (size > PyUnicode_GET_SIZE(unicode))
1113 return PyUnicode_GET_SIZE(unicode);
1114 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001115 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116}
1117
1118#endif
1119
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001120PyObject *PyUnicode_FromOrdinal(int ordinal)
1121{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001122 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001123
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001124 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001125 PyErr_SetString(PyExc_ValueError,
1126 "chr() arg not in range(0x110000)");
1127 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001128 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001129
1130#ifndef Py_UNICODE_WIDE
1131 if (ordinal > 0xffff) {
1132 ordinal -= 0x10000;
1133 s[0] = 0xD800 | (ordinal >> 10);
1134 s[1] = 0xDC00 | (ordinal & 0x3FF);
1135 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001136 }
1137#endif
1138
Hye-Shik Chang40574832004-04-06 07:24:51 +00001139 s[0] = (Py_UNICODE)ordinal;
1140 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001141}
1142
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143PyObject *PyUnicode_FromObject(register PyObject *obj)
1144{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001145 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001147 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001148 Py_INCREF(obj);
1149 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001150 }
1151 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001152 /* For a Unicode subtype that's not a Unicode object,
1153 return a true Unicode object with the same data. */
1154 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1155 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001156 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001157 PyErr_Format(PyExc_TypeError,
1158 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001159 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001160 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001161}
1162
1163PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001164 const char *encoding,
1165 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001166{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001167 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001168 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001169 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001170
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 PyErr_BadInternalCall();
1173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001175
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001176 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001177 PyErr_SetString(PyExc_TypeError,
1178 "decoding str is not supported");
1179 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001180 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001181
1182 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001183 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001184 s = PyBytes_AS_STRING(obj);
1185 len = PyBytes_GET_SIZE(obj);
1186 }
1187 else if (PyByteArray_Check(obj)) {
1188 s = PyByteArray_AS_STRING(obj);
1189 len = PyByteArray_GET_SIZE(obj);
1190 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001191 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001192 /* Overwrite the error message with something more useful in
1193 case of a TypeError. */
1194 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001195 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001196 "coercing to str: need string or buffer, "
1197 "%.80s found",
1198 Py_TYPE(obj)->tp_name);
1199 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001200 }
Tim Petersced69f82003-09-16 20:30:58 +00001201
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001202 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001204 Py_INCREF(unicode_empty);
1205 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 }
Tim Petersced69f82003-09-16 20:30:58 +00001207 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001208 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001209
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001210 return v;
1211
Benjamin Peterson29060642009-01-31 22:14:21 +00001212 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214}
1215
1216PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001217 Py_ssize_t size,
1218 const char *encoding,
1219 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220{
1221 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001222 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001223 char lower[20]; /* Enough for any encoding name we recognize */
1224 char *l;
1225 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001226
1227 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001228 encoding = PyUnicode_GetDefaultEncoding();
1229
1230 /* Convert encoding to lower case and replace '_' with '-' in order to
1231 catch e.g. UTF_8 */
1232 e = encoding;
1233 l = lower;
1234 while (*e && l < &lower[(sizeof lower) - 2]) {
1235 if (ISUPPER(*e)) {
1236 *l++ = TOLOWER(*e++);
1237 }
1238 else if (*e == '_') {
1239 *l++ = '-';
1240 e++;
1241 }
1242 else {
1243 *l++ = *e++;
1244 }
1245 }
1246 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001247
1248 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001249 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001251 else if ((strcmp(lower, "latin-1") == 0) ||
1252 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001253 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001254#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001255 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001256 return PyUnicode_DecodeMBCS(s, size, errors);
1257#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001258 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001259 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001260 else if (strcmp(lower, "utf-16") == 0)
1261 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1262 else if (strcmp(lower, "utf-32") == 0)
1263 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264
1265 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001266 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001267 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001268 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001269 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270 if (buffer == NULL)
1271 goto onError;
1272 unicode = PyCodec_Decode(buffer, encoding, errors);
1273 if (unicode == NULL)
1274 goto onError;
1275 if (!PyUnicode_Check(unicode)) {
1276 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001277 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001278 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 Py_DECREF(unicode);
1280 goto onError;
1281 }
1282 Py_DECREF(buffer);
1283 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001284
Benjamin Peterson29060642009-01-31 22:14:21 +00001285 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286 Py_XDECREF(buffer);
1287 return NULL;
1288}
1289
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001290PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1291 const char *encoding,
1292 const char *errors)
1293{
1294 PyObject *v;
1295
1296 if (!PyUnicode_Check(unicode)) {
1297 PyErr_BadArgument();
1298 goto onError;
1299 }
1300
1301 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001302 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001303
1304 /* Decode via the codec registry */
1305 v = PyCodec_Decode(unicode, encoding, errors);
1306 if (v == NULL)
1307 goto onError;
1308 return v;
1309
Benjamin Peterson29060642009-01-31 22:14:21 +00001310 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001311 return NULL;
1312}
1313
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001314PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1315 const char *encoding,
1316 const char *errors)
1317{
1318 PyObject *v;
1319
1320 if (!PyUnicode_Check(unicode)) {
1321 PyErr_BadArgument();
1322 goto onError;
1323 }
1324
1325 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001326 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001327
1328 /* Decode via the codec registry */
1329 v = PyCodec_Decode(unicode, encoding, errors);
1330 if (v == NULL)
1331 goto onError;
1332 if (!PyUnicode_Check(v)) {
1333 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001334 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001335 Py_TYPE(v)->tp_name);
1336 Py_DECREF(v);
1337 goto onError;
1338 }
1339 return v;
1340
Benjamin Peterson29060642009-01-31 22:14:21 +00001341 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001342 return NULL;
1343}
1344
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001346 Py_ssize_t size,
1347 const char *encoding,
1348 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349{
1350 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001351
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 unicode = PyUnicode_FromUnicode(s, size);
1353 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1356 Py_DECREF(unicode);
1357 return v;
1358}
1359
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001360PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1361 const char *encoding,
1362 const char *errors)
1363{
1364 PyObject *v;
1365
1366 if (!PyUnicode_Check(unicode)) {
1367 PyErr_BadArgument();
1368 goto onError;
1369 }
1370
1371 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001372 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001373
1374 /* Encode via the codec registry */
1375 v = PyCodec_Encode(unicode, encoding, errors);
1376 if (v == NULL)
1377 goto onError;
1378 return v;
1379
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001381 return NULL;
1382}
1383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1385 const char *encoding,
1386 const char *errors)
1387{
1388 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001389
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 if (!PyUnicode_Check(unicode)) {
1391 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 }
Fred Drakee4315f52000-05-09 19:53:39 +00001394
Tim Petersced69f82003-09-16 20:30:58 +00001395 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001396 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001397
1398 /* Shortcuts for common default encodings */
1399 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001400 if (strcmp(encoding, "utf-8") == 0)
1401 return PyUnicode_AsUTF8String(unicode);
1402 else if (strcmp(encoding, "latin-1") == 0)
1403 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001404#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001405 else if (strcmp(encoding, "mbcs") == 0)
1406 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001407#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001408 else if (strcmp(encoding, "ascii") == 0)
1409 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001410 /* During bootstrap, we may need to find the encodings
1411 package, to load the file system encoding, and require the
1412 file system encoding in order to load the encodings
1413 package.
1414
1415 Break out of this dependency by assuming that the path to
1416 the encodings module is ASCII-only. XXX could try wcstombs
1417 instead, if the file system encoding is the locale's
1418 encoding. */
1419 else if (Py_FileSystemDefaultEncoding &&
1420 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1421 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001422 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424
1425 /* Encode via the codec registry */
1426 v = PyCodec_Encode(unicode, encoding, errors);
1427 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001428 return NULL;
1429
1430 /* The normal path */
1431 if (PyBytes_Check(v))
1432 return v;
1433
1434 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001435 if (PyByteArray_Check(v)) {
1436 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001437 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001438 PyOS_snprintf(msg, sizeof(msg),
1439 "encoder %s returned buffer instead of bytes",
1440 encoding);
1441 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001442 Py_DECREF(v);
1443 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001444 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001445
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001446 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1447 Py_DECREF(v);
1448 return b;
1449 }
1450
1451 PyErr_Format(PyExc_TypeError,
1452 "encoder did not return a bytes object (type=%.400s)",
1453 Py_TYPE(v)->tp_name);
1454 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001455 return NULL;
1456}
1457
1458PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1459 const char *encoding,
1460 const char *errors)
1461{
1462 PyObject *v;
1463
1464 if (!PyUnicode_Check(unicode)) {
1465 PyErr_BadArgument();
1466 goto onError;
1467 }
1468
1469 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001470 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001471
1472 /* Encode via the codec registry */
1473 v = PyCodec_Encode(unicode, encoding, errors);
1474 if (v == NULL)
1475 goto onError;
1476 if (!PyUnicode_Check(v)) {
1477 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001478 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001479 Py_TYPE(v)->tp_name);
1480 Py_DECREF(v);
1481 goto onError;
1482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001484
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 return NULL;
1487}
1488
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001489PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001490 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001491{
1492 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001493 if (v)
1494 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001495 if (errors != NULL)
1496 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001497 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001498 PyUnicode_GET_SIZE(unicode),
1499 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001500 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001501 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001502 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001503 return v;
1504}
1505
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001506PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001507PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001508 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001509 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1510}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001511
Christian Heimes5894ba72007-11-04 11:43:14 +00001512PyObject*
1513PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1514{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001515 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1516 can be undefined. If it is case, decode using UTF-8. The following assumes
1517 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1518 bootstrapping process where the codecs aren't ready yet.
1519 */
1520 if (Py_FileSystemDefaultEncoding) {
1521#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001522 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001523 return PyUnicode_DecodeMBCS(s, size, "replace");
1524 }
1525#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001526 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001527 return PyUnicode_DecodeUTF8(s, size, "replace");
1528 }
1529#endif
1530 return PyUnicode_Decode(s, size,
1531 Py_FileSystemDefaultEncoding,
1532 "replace");
1533 }
1534 else {
1535 return PyUnicode_DecodeUTF8(s, size, "replace");
1536 }
1537}
1538
Martin v. Löwis011e8422009-05-05 04:43:17 +00001539/* Convert the argument to a bytes object, according to the file
1540 system encoding */
1541
1542int
1543PyUnicode_FSConverter(PyObject* arg, void* addr)
1544{
1545 PyObject *output = NULL;
1546 Py_ssize_t size;
1547 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001548 if (arg == NULL) {
1549 Py_DECREF(*(PyObject**)addr);
1550 return 1;
1551 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001552 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1553 output = arg;
1554 Py_INCREF(output);
1555 }
1556 else {
1557 arg = PyUnicode_FromObject(arg);
1558 if (!arg)
1559 return 0;
1560 output = PyUnicode_AsEncodedObject(arg,
1561 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001562 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001563 Py_DECREF(arg);
1564 if (!output)
1565 return 0;
1566 if (!PyBytes_Check(output)) {
1567 Py_DECREF(output);
1568 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1569 return 0;
1570 }
1571 }
1572 if (PyBytes_Check(output)) {
1573 size = PyBytes_GET_SIZE(output);
1574 data = PyBytes_AS_STRING(output);
1575 }
1576 else {
1577 size = PyByteArray_GET_SIZE(output);
1578 data = PyByteArray_AS_STRING(output);
1579 }
1580 if (size != strlen(data)) {
1581 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1582 Py_DECREF(output);
1583 return 0;
1584 }
1585 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001586 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001587}
1588
1589
Martin v. Löwis5b222132007-06-10 09:51:05 +00001590char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001591_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001592{
Christian Heimesf3863112007-11-22 07:46:41 +00001593 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001594 if (!PyUnicode_Check(unicode)) {
1595 PyErr_BadArgument();
1596 return NULL;
1597 }
Christian Heimesf3863112007-11-22 07:46:41 +00001598 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1599 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001600 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001601 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001602 *psize = PyBytes_GET_SIZE(bytes);
1603 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001604}
1605
1606char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001607_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001608{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001609 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001610}
1611
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1613{
1614 if (!PyUnicode_Check(unicode)) {
1615 PyErr_BadArgument();
1616 goto onError;
1617 }
1618 return PyUnicode_AS_UNICODE(unicode);
1619
Benjamin Peterson29060642009-01-31 22:14:21 +00001620 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 return NULL;
1622}
1623
Martin v. Löwis18e16552006-02-15 17:27:45 +00001624Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001625{
1626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
1628 goto onError;
1629 }
1630 return PyUnicode_GET_SIZE(unicode);
1631
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001633 return -1;
1634}
1635
Thomas Wouters78890102000-07-22 19:25:51 +00001636const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001637{
1638 return unicode_default_encoding;
1639}
1640
1641int PyUnicode_SetDefaultEncoding(const char *encoding)
1642{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001643 if (strcmp(encoding, unicode_default_encoding) != 0) {
1644 PyErr_Format(PyExc_ValueError,
1645 "Can only set default encoding to %s",
1646 unicode_default_encoding);
1647 return -1;
1648 }
Fred Drakee4315f52000-05-09 19:53:39 +00001649 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001650}
1651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001652/* error handling callback helper:
1653 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001654 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001655 and adjust various state variables.
1656 return 0 on success, -1 on error
1657*/
1658
1659static
1660int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001661 const char *encoding, const char *reason,
1662 const char **input, const char **inend, Py_ssize_t *startinpos,
1663 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1664 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001665{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001666 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001667
1668 PyObject *restuple = NULL;
1669 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001670 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001671 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001672 Py_ssize_t requiredsize;
1673 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001674 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001675 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001676 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001677 int res = -1;
1678
1679 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001680 *errorHandler = PyCodec_LookupError(errors);
1681 if (*errorHandler == NULL)
1682 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001683 }
1684
1685 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001686 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001687 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1688 if (*exceptionObject == NULL)
1689 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001690 }
1691 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001692 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1693 goto onError;
1694 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1695 goto onError;
1696 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1697 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001698 }
1699
1700 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1701 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001702 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001703 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001704 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001705 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001706 }
1707 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001708 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001709
1710 /* Copy back the bytes variables, which might have been modified by the
1711 callback */
1712 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1713 if (!inputobj)
1714 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001715 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001716 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001717 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001718 *input = PyBytes_AS_STRING(inputobj);
1719 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001720 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001721 /* we can DECREF safely, as the exception has another reference,
1722 so the object won't go away. */
1723 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001725 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001726 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001727 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001728 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1729 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001730 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001731
1732 /* need more space? (at least enough for what we
1733 have+the replacement+the rest of the string (starting
1734 at the new input position), so we won't have to check space
1735 when there are no errors in the rest of the string) */
1736 repptr = PyUnicode_AS_UNICODE(repunicode);
1737 repsize = PyUnicode_GET_SIZE(repunicode);
1738 requiredsize = *outpos + repsize + insize-newpos;
1739 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001740 if (requiredsize<2*outsize)
1741 requiredsize = 2*outsize;
1742 if (_PyUnicode_Resize(output, requiredsize) < 0)
1743 goto onError;
1744 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001745 }
1746 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001747 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001748 Py_UNICODE_COPY(*outptr, repptr, repsize);
1749 *outptr += repsize;
1750 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 /* we made it! */
1753 res = 0;
1754
Benjamin Peterson29060642009-01-31 22:14:21 +00001755 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 Py_XDECREF(restuple);
1757 return res;
1758}
1759
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001760/* --- UTF-7 Codec -------------------------------------------------------- */
1761
Antoine Pitrou244651a2009-05-04 18:56:13 +00001762/* See RFC2152 for details. We encode conservatively and decode liberally. */
1763
/* Three simple macros defining base-64.  Each may evaluate its argument
   more than once, so arguments must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001840
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001841PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001842 Py_ssize_t size,
1843 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001845 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1846}
1847
Antoine Pitrou244651a2009-05-04 18:56:13 +00001848/* The decoder. The only state we preserve is our read position,
1849 * i.e. how many characters we have consumed. So if we end in the
1850 * middle of a shift sequence we have to back off the read position
1851 * and the output to the beginning of the sequence, otherwise we lose
1852 * all the shift state (seen bits, number of bits seen, high
1853 * surrogate). */
1854
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001855PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 Py_ssize_t size,
1857 const char *errors,
1858 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001859{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001860 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001861 Py_ssize_t startinpos;
1862 Py_ssize_t endinpos;
1863 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001864 const char *e;
1865 PyUnicodeObject *unicode;
1866 Py_UNICODE *p;
1867 const char *errmsg = "";
1868 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001869 Py_UNICODE *shiftOutStart;
1870 unsigned int base64bits = 0;
1871 unsigned long base64buffer = 0;
1872 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001873 PyObject *errorHandler = NULL;
1874 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001875
1876 unicode = _PyUnicode_New(size);
1877 if (!unicode)
1878 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001879 if (size == 0) {
1880 if (consumed)
1881 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001882 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001883 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001884
1885 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001886 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001887 e = s + size;
1888
1889 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001890 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001891 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001892 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001893
Antoine Pitrou244651a2009-05-04 18:56:13 +00001894 if (inShift) { /* in a base-64 section */
1895 if (IS_BASE64(ch)) { /* consume a base-64 character */
1896 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1897 base64bits += 6;
1898 s++;
1899 if (base64bits >= 16) {
1900 /* we have enough bits for a UTF-16 value */
1901 Py_UNICODE outCh = (Py_UNICODE)
1902 (base64buffer >> (base64bits-16));
1903 base64bits -= 16;
1904 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1905 if (surrogate) {
1906 /* expecting a second surrogate */
1907 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1908#ifdef Py_UNICODE_WIDE
1909 *p++ = (((surrogate & 0x3FF)<<10)
1910 | (outCh & 0x3FF)) + 0x10000;
1911#else
1912 *p++ = surrogate;
1913 *p++ = outCh;
1914#endif
1915 surrogate = 0;
1916 }
1917 else {
1918 surrogate = 0;
1919 errmsg = "second surrogate missing";
1920 goto utf7Error;
1921 }
1922 }
1923 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1924 /* first surrogate */
1925 surrogate = outCh;
1926 }
1927 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1928 errmsg = "unexpected second surrogate";
1929 goto utf7Error;
1930 }
1931 else {
1932 *p++ = outCh;
1933 }
1934 }
1935 }
1936 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001937 inShift = 0;
1938 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001939 if (surrogate) {
1940 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001941 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001942 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001943 if (base64bits > 0) { /* left-over bits */
1944 if (base64bits >= 6) {
1945 /* We've seen at least one base-64 character */
1946 errmsg = "partial character in shift sequence";
1947 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001948 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001949 else {
1950 /* Some bits remain; they should be zero */
1951 if (base64buffer != 0) {
1952 errmsg = "non-zero padding bits in shift sequence";
1953 goto utf7Error;
1954 }
1955 }
1956 }
1957 if (ch != '-') {
1958 /* '-' is absorbed; other terminating
1959 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001960 *p++ = ch;
1961 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001962 }
1963 }
1964 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001966 s++; /* consume '+' */
1967 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001968 s++;
1969 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00001970 }
1971 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001972 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001973 shiftOutStart = p;
1974 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001975 }
1976 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001977 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001978 *p++ = ch;
1979 s++;
1980 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00001981 else {
1982 startinpos = s-starts;
1983 s++;
1984 errmsg = "unexpected special character";
1985 goto utf7Error;
1986 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001987 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001988utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 outpos = p-PyUnicode_AS_UNICODE(unicode);
1990 endinpos = s-starts;
1991 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00001992 errors, &errorHandler,
1993 "utf7", errmsg,
1994 &starts, &e, &startinpos, &endinpos, &exc, &s,
1995 &unicode, &outpos, &p))
1996 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001997 }
1998
Antoine Pitrou244651a2009-05-04 18:56:13 +00001999 /* end of string */
2000
2001 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2002 /* if we're in an inconsistent state, that's an error */
2003 if (surrogate ||
2004 (base64bits >= 6) ||
2005 (base64bits > 0 && base64buffer != 0)) {
2006 outpos = p-PyUnicode_AS_UNICODE(unicode);
2007 endinpos = size;
2008 if (unicode_decode_call_errorhandler(
2009 errors, &errorHandler,
2010 "utf7", "unterminated shift sequence",
2011 &starts, &e, &startinpos, &endinpos, &exc, &s,
2012 &unicode, &outpos, &p))
2013 goto onError;
2014 if (s < e)
2015 goto restart;
2016 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002017 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002018
2019 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002020 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002021 if (inShift) {
2022 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002023 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002024 }
2025 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002026 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002027 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002028 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002029
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002030 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002031 goto onError;
2032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002033 Py_XDECREF(errorHandler);
2034 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002035 return (PyObject *)unicode;
2036
Benjamin Peterson29060642009-01-31 22:14:21 +00002037 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038 Py_XDECREF(errorHandler);
2039 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002040 Py_DECREF(unicode);
2041 return NULL;
2042}
2043
2044
2045PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002046 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002047 int base64SetO,
2048 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002049 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002050{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002051 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002052 /* It might be possible to tighten this worst case */
Georg Brandl194da4a2009-08-13 09:34:05 +00002053 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002054 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002055 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002056 unsigned int base64bits = 0;
2057 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002058 char * out;
2059 char * start;
2060
2061 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002062 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002063
Georg Brandl194da4a2009-08-13 09:34:05 +00002064 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002065 return PyErr_NoMemory();
2066
Antoine Pitrou244651a2009-05-04 18:56:13 +00002067 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002068 if (v == NULL)
2069 return NULL;
2070
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002071 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002072 for (;i < size; ++i) {
2073 Py_UNICODE ch = s[i];
2074
Antoine Pitrou244651a2009-05-04 18:56:13 +00002075 if (inShift) {
2076 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2077 /* shifting out */
2078 if (base64bits) { /* output remaining bits */
2079 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2080 base64buffer = 0;
2081 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002082 }
2083 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002084 /* Characters not in the BASE64 set implicitly unshift the sequence
2085 so no '-' is required, except if the character is itself a '-' */
2086 if (IS_BASE64(ch) || ch == '-') {
2087 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002088 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089 *out++ = (char) ch;
2090 }
2091 else {
2092 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002093 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002094 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002095 else { /* not in a shift sequence */
2096 if (ch == '+') {
2097 *out++ = '+';
2098 *out++ = '-';
2099 }
2100 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2101 *out++ = (char) ch;
2102 }
2103 else {
2104 *out++ = '+';
2105 inShift = 1;
2106 goto encode_char;
2107 }
2108 }
2109 continue;
2110encode_char:
2111#ifdef Py_UNICODE_WIDE
2112 if (ch >= 0x10000) {
2113 /* code first surrogate */
2114 base64bits += 16;
2115 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2116 while (base64bits >= 6) {
2117 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2118 base64bits -= 6;
2119 }
2120 /* prepare second surrogate */
2121 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2122 }
2123#endif
2124 base64bits += 16;
2125 base64buffer = (base64buffer << 16) | ch;
2126 while (base64bits >= 6) {
2127 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2128 base64bits -= 6;
2129 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002130 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002131 if (base64bits)
2132 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2133 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002134 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002135 if (_PyBytes_Resize(&v, out - start) < 0)
2136 return NULL;
2137 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002138}
2139
/* The UTF-7 helper macros are not needed past this point. */
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
#undef FROM_BASE64
#undef IS_BASE64
#undef TO_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002145
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146/* --- UTF-8 Codec -------------------------------------------------------- */
2147
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte per character */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80 - 0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0 - 0xFD: four-, five- and six-byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
2169
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002171 Py_ssize_t size,
2172 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173{
Walter Dörwald69652032004-09-07 20:24:22 +00002174 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2175}
2176
Antoine Pitrouab868312009-01-10 15:40:25 +00002177/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2178#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2179
2180/* Mask to quickly check whether a C 'long' contains a
2181 non-ASCII, UTF8-encoded char. */
2182#if (SIZEOF_LONG == 8)
2183# define ASCII_CHAR_MASK 0x8080808080808080L
2184#elif (SIZEOF_LONG == 4)
2185# define ASCII_CHAR_MASK 0x80808080L
2186#else
2187# error C 'long' size should be either 4 or 8!
2188#endif
2189
Walter Dörwald69652032004-09-07 20:24:22 +00002190PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002191 Py_ssize_t size,
2192 const char *errors,
2193 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002194{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002197 Py_ssize_t startinpos;
2198 Py_ssize_t endinpos;
2199 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002200 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 PyUnicodeObject *unicode;
2202 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002203 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002204 PyObject *errorHandler = NULL;
2205 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206
2207 /* Note: size will always be longer than the resulting Unicode
2208 character count */
2209 unicode = _PyUnicode_New(size);
2210 if (!unicode)
2211 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002212 if (size == 0) {
2213 if (consumed)
2214 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217
2218 /* Unpack UTF-8 encoded data */
2219 p = unicode->str;
2220 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002221 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222
2223 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002224 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002225
2226 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002227 /* Fast path for runs of ASCII characters. Given that common UTF-8
2228 input will consist of an overwhelming majority of ASCII
2229 characters, we try to optimize for this case by checking
2230 as many characters as a C 'long' can contain.
2231 First, check if we can do an aligned read, as most CPUs have
2232 a penalty for unaligned reads.
2233 */
2234 if (!((size_t) s & LONG_PTR_MASK)) {
2235 /* Help register allocation */
2236 register const char *_s = s;
2237 register Py_UNICODE *_p = p;
2238 while (_s < aligned_end) {
2239 /* Read a whole long at a time (either 4 or 8 bytes),
2240 and do a fast unrolled copy if it only contains ASCII
2241 characters. */
2242 unsigned long data = *(unsigned long *) _s;
2243 if (data & ASCII_CHAR_MASK)
2244 break;
2245 _p[0] = (unsigned char) _s[0];
2246 _p[1] = (unsigned char) _s[1];
2247 _p[2] = (unsigned char) _s[2];
2248 _p[3] = (unsigned char) _s[3];
2249#if (SIZEOF_LONG == 8)
2250 _p[4] = (unsigned char) _s[4];
2251 _p[5] = (unsigned char) _s[5];
2252 _p[6] = (unsigned char) _s[6];
2253 _p[7] = (unsigned char) _s[7];
2254#endif
2255 _s += SIZEOF_LONG;
2256 _p += SIZEOF_LONG;
2257 }
2258 s = _s;
2259 p = _p;
2260 if (s == e)
2261 break;
2262 ch = (unsigned char)*s;
2263 }
2264 }
2265
2266 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002267 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 s++;
2269 continue;
2270 }
2271
2272 n = utf8_code_length[ch];
2273
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002274 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002275 if (consumed)
2276 break;
2277 else {
2278 errmsg = "unexpected end of data";
2279 startinpos = s-starts;
2280 endinpos = size;
2281 goto utf8Error;
2282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284
2285 switch (n) {
2286
2287 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002288 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 startinpos = s-starts;
2290 endinpos = startinpos+1;
2291 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292
2293 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002294 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002295 startinpos = s-starts;
2296 endinpos = startinpos+1;
2297 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298
2299 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002300 if ((s[1] & 0xc0) != 0x80) {
2301 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 startinpos = s-starts;
2303 endinpos = startinpos+2;
2304 goto utf8Error;
2305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002307 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002308 startinpos = s-starts;
2309 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002310 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002311 goto utf8Error;
2312 }
2313 else
2314 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315 break;
2316
2317 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002318 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002319 (s[2] & 0xc0) != 0x80) {
2320 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002321 startinpos = s-starts;
2322 endinpos = startinpos+3;
2323 goto utf8Error;
2324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002326 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002327 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002328 startinpos = s-starts;
2329 endinpos = startinpos+3;
2330 goto utf8Error;
2331 }
2332 else
2333 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002334 break;
2335
2336 case 4:
2337 if ((s[1] & 0xc0) != 0x80 ||
2338 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002339 (s[3] & 0xc0) != 0x80) {
2340 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002341 startinpos = s-starts;
2342 endinpos = startinpos+4;
2343 goto utf8Error;
2344 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002345 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002346 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002347 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002348 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002349 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002350 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002351 UTF-16 */
2352 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002353 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002354 startinpos = s-starts;
2355 endinpos = startinpos+4;
2356 goto utf8Error;
2357 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002358#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002359 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002360#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002361 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002362
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002363 /* translate from 10000..10FFFF to 0..FFFF */
2364 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002365
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002366 /* high surrogate = top 10 bits added to D800 */
2367 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002368
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002369 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002370 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002371#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372 break;
2373
2374 default:
2375 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002376 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002377 startinpos = s-starts;
2378 endinpos = startinpos+n;
2379 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002380 }
2381 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002382 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002383
Benjamin Peterson29060642009-01-31 22:14:21 +00002384 utf8Error:
2385 outpos = p-PyUnicode_AS_UNICODE(unicode);
2386 if (unicode_decode_call_errorhandler(
2387 errors, &errorHandler,
2388 "utf8", errmsg,
2389 &starts, &e, &startinpos, &endinpos, &exc, &s,
2390 &unicode, &outpos, &p))
2391 goto onError;
2392 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393 }
Walter Dörwald69652032004-09-07 20:24:22 +00002394 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002395 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396
2397 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002398 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399 goto onError;
2400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002401 Py_XDECREF(errorHandler);
2402 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 return (PyObject *)unicode;
2404
Benjamin Peterson29060642009-01-31 22:14:21 +00002405 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002406 Py_XDECREF(errorHandler);
2407 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 Py_DECREF(unicode);
2409 return NULL;
2410}
2411
Antoine Pitrouab868312009-01-10 15:40:25 +00002412#undef ASCII_CHAR_MASK
2413
2414
Tim Peters602f7402002-04-27 18:03:26 +00002415/* Allocation strategy: if the string is short, convert into a stack buffer
2416 and allocate exactly as much space needed at the end. Else allocate the
2417 maximum possible needed (4 result bytes per Unicode character), and return
2418 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002419*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002420PyObject *
2421PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002422 Py_ssize_t size,
2423 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424{
Tim Peters602f7402002-04-27 18:03:26 +00002425#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002426
Guido van Rossum98297ee2007-11-06 21:34:58 +00002427 Py_ssize_t i; /* index into s of next input byte */
2428 PyObject *result; /* result string object */
2429 char *p; /* next free byte in output buffer */
2430 Py_ssize_t nallocated; /* number of result bytes allocated */
2431 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002432 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002433 PyObject *errorHandler = NULL;
2434 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002435
Tim Peters602f7402002-04-27 18:03:26 +00002436 assert(s != NULL);
2437 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002438
Tim Peters602f7402002-04-27 18:03:26 +00002439 if (size <= MAX_SHORT_UNICHARS) {
2440 /* Write into the stack buffer; nallocated can't overflow.
2441 * At the end, we'll allocate exactly as much heap space as it
2442 * turns out we need.
2443 */
2444 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002445 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002446 p = stackbuf;
2447 }
2448 else {
2449 /* Overallocate on the heap, and give the excess back at the end. */
2450 nallocated = size * 4;
2451 if (nallocated / 4 != size) /* overflow! */
2452 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002453 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002454 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002455 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002456 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002457 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002458
Tim Peters602f7402002-04-27 18:03:26 +00002459 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002460 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002461
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002462 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002463 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002465
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002467 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002468 *p++ = (char)(0xc0 | (ch >> 6));
2469 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner158701d2010-04-22 19:41:01 +00002470 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002471#ifndef Py_UNICODE_WIDE
Victor Stinner158701d2010-04-22 19:41:01 +00002472 /* Special case: check for high and low surrogate */
2473 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2474 Py_UCS4 ch2 = s[i];
2475 /* Combine the two surrogates to form a UCS4 value */
2476 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2477 i++;
2478
2479 /* Encode UCS4 Unicode ordinals */
2480 *p++ = (char)(0xf0 | (ch >> 18));
2481 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002482 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2483 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner158701d2010-04-22 19:41:01 +00002484 } else {
Victor Stinner0b79b762010-04-22 20:07:28 +00002485#endif
Victor Stinner158701d2010-04-22 19:41:01 +00002486 Py_ssize_t newpos;
2487 PyObject *rep;
2488 Py_ssize_t repsize, k;
2489 rep = unicode_encode_call_errorhandler
2490 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2491 s, size, &exc, i-1, i, &newpos);
2492 if (!rep)
2493 goto error;
2494
2495 if (PyBytes_Check(rep))
2496 repsize = PyBytes_GET_SIZE(rep);
2497 else
2498 repsize = PyUnicode_GET_SIZE(rep);
2499
2500 if (repsize > 4) {
2501 Py_ssize_t offset;
2502
2503 if (result == NULL)
2504 offset = p - stackbuf;
2505 else
2506 offset = p - PyBytes_AS_STRING(result);
2507
2508 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2509 /* integer overflow */
2510 PyErr_NoMemory();
2511 goto error;
2512 }
2513 nallocated += repsize - 4;
2514 if (result != NULL) {
2515 if (_PyBytes_Resize(&result, nallocated) < 0)
2516 goto error;
2517 } else {
2518 result = PyBytes_FromStringAndSize(NULL, nallocated);
2519 if (result == NULL)
2520 goto error;
2521 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2522 }
2523 p = PyBytes_AS_STRING(result) + offset;
2524 }
2525
2526 if (PyBytes_Check(rep)) {
2527 char *prep = PyBytes_AS_STRING(rep);
2528 for(k = repsize; k > 0; k--)
2529 *p++ = *prep++;
2530 } else /* rep is unicode */ {
2531 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2532 Py_UNICODE c;
2533
2534 for(k=0; k<repsize; k++) {
2535 c = prep[k];
2536 if (0x80 <= c) {
2537 raise_encode_exception(&exc, "utf-8", s, size,
2538 i-1, i, "surrogates not allowed");
2539 goto error;
2540 }
2541 *p++ = (char)prep[k];
2542 }
2543 }
2544 Py_DECREF(rep);
Victor Stinner0b79b762010-04-22 20:07:28 +00002545#ifndef Py_UNICODE_WIDE
Victor Stinner158701d2010-04-22 19:41:01 +00002546 }
Victor Stinner0b79b762010-04-22 20:07:28 +00002547#endif
Victor Stinner158701d2010-04-22 19:41:01 +00002548 } else if (ch < 0x10000) {
2549 *p++ = (char)(0xe0 | (ch >> 12));
2550 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2551 *p++ = (char)(0x80 | (ch & 0x3f));
2552 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002553 /* Encode UCS4 Unicode ordinals */
2554 *p++ = (char)(0xf0 | (ch >> 18));
2555 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2556 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2557 *p++ = (char)(0x80 | (ch & 0x3f));
2558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002560
Guido van Rossum98297ee2007-11-06 21:34:58 +00002561 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002562 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002563 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002564 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002565 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002566 }
2567 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002568 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002569 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002570 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002571 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002572 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002573 Py_XDECREF(errorHandler);
2574 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002575 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002576 error:
2577 Py_XDECREF(errorHandler);
2578 Py_XDECREF(exc);
2579 Py_XDECREF(result);
2580 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002581
Tim Peters602f7402002-04-27 18:03:26 +00002582#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583}
2584
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2586{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 if (!PyUnicode_Check(unicode)) {
2588 PyErr_BadArgument();
2589 return NULL;
2590 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002591 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002592 PyUnicode_GET_SIZE(unicode),
2593 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594}
2595
Walter Dörwald41980ca2007-08-16 21:55:45 +00002596/* --- UTF-32 Codec ------------------------------------------------------- */
2597
2598PyObject *
2599PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002600 Py_ssize_t size,
2601 const char *errors,
2602 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002603{
2604 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2605}
2606
2607PyObject *
2608PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002609 Py_ssize_t size,
2610 const char *errors,
2611 int *byteorder,
2612 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002613{
2614 const char *starts = s;
2615 Py_ssize_t startinpos;
2616 Py_ssize_t endinpos;
2617 Py_ssize_t outpos;
2618 PyUnicodeObject *unicode;
2619 Py_UNICODE *p;
2620#ifndef Py_UNICODE_WIDE
Antoine Pitrou6107a682010-06-11 21:48:34 +00002621 int pairs = 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002622#else
2623 const int pairs = 0;
2624#endif
Antoine Pitrou6107a682010-06-11 21:48:34 +00002625 const unsigned char *q, *e, *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002626 int bo = 0; /* assume native ordering by default */
2627 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002628 /* Offsets from q for retrieving bytes in the right order. */
2629#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2630 int iorder[] = {0, 1, 2, 3};
2631#else
2632 int iorder[] = {3, 2, 1, 0};
2633#endif
2634 PyObject *errorHandler = NULL;
2635 PyObject *exc = NULL;
Antoine Pitrou6107a682010-06-11 21:48:34 +00002636
Walter Dörwald41980ca2007-08-16 21:55:45 +00002637 q = (unsigned char *)s;
2638 e = q + size;
2639
2640 if (byteorder)
2641 bo = *byteorder;
2642
2643 /* Check for BOM marks (U+FEFF) in the input and adjust current
2644 byte order setting accordingly. In native mode, the leading BOM
2645 mark is skipped, in all other modes, it is copied to the output
2646 stream as-is (giving a ZWNBSP character). */
2647 if (bo == 0) {
2648 if (size >= 4) {
2649 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002650 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002651#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 if (bom == 0x0000FEFF) {
2653 q += 4;
2654 bo = -1;
2655 }
2656 else if (bom == 0xFFFE0000) {
2657 q += 4;
2658 bo = 1;
2659 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002660#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002661 if (bom == 0x0000FEFF) {
2662 q += 4;
2663 bo = 1;
2664 }
2665 else if (bom == 0xFFFE0000) {
2666 q += 4;
2667 bo = -1;
2668 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002669#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002670 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002671 }
2672
2673 if (bo == -1) {
2674 /* force LE */
2675 iorder[0] = 0;
2676 iorder[1] = 1;
2677 iorder[2] = 2;
2678 iorder[3] = 3;
2679 }
2680 else if (bo == 1) {
2681 /* force BE */
2682 iorder[0] = 3;
2683 iorder[1] = 2;
2684 iorder[2] = 1;
2685 iorder[3] = 0;
2686 }
2687
Antoine Pitrou6107a682010-06-11 21:48:34 +00002688 /* On narrow builds we split characters outside the BMP into two
2689 codepoints => count how much extra space we need. */
2690#ifndef Py_UNICODE_WIDE
2691 for (qq = q; qq < e; qq += 4)
2692 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2693 pairs++;
2694#endif
2695
2696 /* This might be one to much, because of a BOM */
2697 unicode = _PyUnicode_New((size+3)/4+pairs);
2698 if (!unicode)
2699 return NULL;
2700 if (size == 0)
2701 return (PyObject *)unicode;
2702
2703 /* Unpack UTF-32 encoded data */
2704 p = unicode->str;
2705
Walter Dörwald41980ca2007-08-16 21:55:45 +00002706 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002707 Py_UCS4 ch;
2708 /* remaining bytes at the end? (size should be divisible by 4) */
2709 if (e-q<4) {
2710 if (consumed)
2711 break;
2712 errmsg = "truncated data";
2713 startinpos = ((const char *)q)-starts;
2714 endinpos = ((const char *)e)-starts;
2715 goto utf32Error;
2716 /* The remaining input chars are ignored if the callback
2717 chooses to skip the input */
2718 }
2719 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2720 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002721
Benjamin Peterson29060642009-01-31 22:14:21 +00002722 if (ch >= 0x110000)
2723 {
2724 errmsg = "codepoint not in range(0x110000)";
2725 startinpos = ((const char *)q)-starts;
2726 endinpos = startinpos+4;
2727 goto utf32Error;
2728 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002729#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002730 if (ch >= 0x10000)
2731 {
2732 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2733 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2734 }
2735 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002736#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 *p++ = ch;
2738 q += 4;
2739 continue;
2740 utf32Error:
2741 outpos = p-PyUnicode_AS_UNICODE(unicode);
2742 if (unicode_decode_call_errorhandler(
2743 errors, &errorHandler,
2744 "utf32", errmsg,
2745 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2746 &unicode, &outpos, &p))
2747 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002748 }
2749
2750 if (byteorder)
2751 *byteorder = bo;
2752
2753 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002754 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002755
2756 /* Adjust length */
2757 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2758 goto onError;
2759
2760 Py_XDECREF(errorHandler);
2761 Py_XDECREF(exc);
2762 return (PyObject *)unicode;
2763
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002765 Py_DECREF(unicode);
2766 Py_XDECREF(errorHandler);
2767 Py_XDECREF(exc);
2768 return NULL;
2769}
2770
2771PyObject *
2772PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002773 Py_ssize_t size,
2774 const char *errors,
2775 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002776{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002777 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002778 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002779 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002780#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002781 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002782#else
2783 const int pairs = 0;
2784#endif
2785 /* Offsets from p for storing byte pairs in the right order. */
2786#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2787 int iorder[] = {0, 1, 2, 3};
2788#else
2789 int iorder[] = {3, 2, 1, 0};
2790#endif
2791
Benjamin Peterson29060642009-01-31 22:14:21 +00002792#define STORECHAR(CH) \
2793 do { \
2794 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2795 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2796 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2797 p[iorder[0]] = (CH) & 0xff; \
2798 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002799 } while(0)
2800
2801 /* In narrow builds we can output surrogate pairs as one codepoint,
2802 so we need less space. */
2803#ifndef Py_UNICODE_WIDE
2804 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2806 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2807 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002808#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002809 nsize = (size - pairs + (byteorder == 0));
2810 bytesize = nsize * 4;
2811 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002813 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002814 if (v == NULL)
2815 return NULL;
2816
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002817 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002818 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002820 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002821 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002822
2823 if (byteorder == -1) {
2824 /* force LE */
2825 iorder[0] = 0;
2826 iorder[1] = 1;
2827 iorder[2] = 2;
2828 iorder[3] = 3;
2829 }
2830 else if (byteorder == 1) {
2831 /* force BE */
2832 iorder[0] = 3;
2833 iorder[1] = 2;
2834 iorder[2] = 1;
2835 iorder[3] = 0;
2836 }
2837
2838 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002840#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2842 Py_UCS4 ch2 = *s;
2843 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2844 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2845 s++;
2846 size--;
2847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002848 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002849#endif
2850 STORECHAR(ch);
2851 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002852
2853 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002854 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002855#undef STORECHAR
2856}
2857
2858PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2859{
2860 if (!PyUnicode_Check(unicode)) {
2861 PyErr_BadArgument();
2862 return NULL;
2863 }
2864 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002865 PyUnicode_GET_SIZE(unicode),
2866 NULL,
2867 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002868}
2869
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870/* --- UTF-16 Codec ------------------------------------------------------- */
2871
Tim Peters772747b2001-08-09 22:21:55 +00002872PyObject *
2873PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 Py_ssize_t size,
2875 const char *errors,
2876 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877{
Walter Dörwald69652032004-09-07 20:24:22 +00002878 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2879}
2880
Antoine Pitrouab868312009-01-10 15:40:25 +00002881/* Two masks for fast checking of whether a C 'long' may contain
2882 UTF16-encoded surrogate characters. This is an efficient heuristic,
2883 assuming that non-surrogate characters with a code point >= 0x8000 are
2884 rare in most input.
2885 FAST_CHAR_MASK is used when the input is in native byte ordering,
2886 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002887*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002888#if (SIZEOF_LONG == 8)
2889# define FAST_CHAR_MASK 0x8000800080008000L
2890# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2891#elif (SIZEOF_LONG == 4)
2892# define FAST_CHAR_MASK 0x80008000L
2893# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2894#else
2895# error C 'long' size should be either 4 or 8!
2896#endif
2897
Walter Dörwald69652032004-09-07 20:24:22 +00002898PyObject *
2899PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002900 Py_ssize_t size,
2901 const char *errors,
2902 int *byteorder,
2903 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002904{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002905 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002906 Py_ssize_t startinpos;
2907 Py_ssize_t endinpos;
2908 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 PyUnicodeObject *unicode;
2910 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002911 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002912 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002913 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002914 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002915 /* Offsets from q for retrieving byte pairs in the right order. */
2916#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2917 int ihi = 1, ilo = 0;
2918#else
2919 int ihi = 0, ilo = 1;
2920#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002921 PyObject *errorHandler = NULL;
2922 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923
2924 /* Note: size will always be longer than the resulting Unicode
2925 character count */
2926 unicode = _PyUnicode_New(size);
2927 if (!unicode)
2928 return NULL;
2929 if (size == 0)
2930 return (PyObject *)unicode;
2931
2932 /* Unpack UTF-16 encoded data */
2933 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002934 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002935 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936
2937 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002938 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002940 /* Check for BOM marks (U+FEFF) in the input and adjust current
2941 byte order setting accordingly. In native mode, the leading BOM
2942 mark is skipped, in all other modes, it is copied to the output
2943 stream as-is (giving a ZWNBSP character). */
2944 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002945 if (size >= 2) {
2946 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002947#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002948 if (bom == 0xFEFF) {
2949 q += 2;
2950 bo = -1;
2951 }
2952 else if (bom == 0xFFFE) {
2953 q += 2;
2954 bo = 1;
2955 }
Tim Petersced69f82003-09-16 20:30:58 +00002956#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002957 if (bom == 0xFEFF) {
2958 q += 2;
2959 bo = 1;
2960 }
2961 else if (bom == 0xFFFE) {
2962 q += 2;
2963 bo = -1;
2964 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002965#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002966 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968
Tim Peters772747b2001-08-09 22:21:55 +00002969 if (bo == -1) {
2970 /* force LE */
2971 ihi = 1;
2972 ilo = 0;
2973 }
2974 else if (bo == 1) {
2975 /* force BE */
2976 ihi = 0;
2977 ilo = 1;
2978 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002979#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2980 native_ordering = ilo < ihi;
2981#else
2982 native_ordering = ilo > ihi;
2983#endif
Tim Peters772747b2001-08-09 22:21:55 +00002984
Antoine Pitrouab868312009-01-10 15:40:25 +00002985 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002986 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002987 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002988 /* First check for possible aligned read of a C 'long'. Unaligned
2989 reads are more expensive, better to defer to another iteration. */
2990 if (!((size_t) q & LONG_PTR_MASK)) {
2991 /* Fast path for runs of non-surrogate chars. */
2992 register const unsigned char *_q = q;
2993 Py_UNICODE *_p = p;
2994 if (native_ordering) {
2995 /* Native ordering is simple: as long as the input cannot
2996 possibly contain a surrogate char, do an unrolled copy
2997 of several 16-bit code points to the target object.
2998 The non-surrogate check is done on several input bytes
2999 at a time (as many as a C 'long' can contain). */
3000 while (_q < aligned_end) {
3001 unsigned long data = * (unsigned long *) _q;
3002 if (data & FAST_CHAR_MASK)
3003 break;
3004 _p[0] = ((unsigned short *) _q)[0];
3005 _p[1] = ((unsigned short *) _q)[1];
3006#if (SIZEOF_LONG == 8)
3007 _p[2] = ((unsigned short *) _q)[2];
3008 _p[3] = ((unsigned short *) _q)[3];
3009#endif
3010 _q += SIZEOF_LONG;
3011 _p += SIZEOF_LONG / 2;
3012 }
3013 }
3014 else {
3015 /* Byteswapped ordering is similar, but we must decompose
3016 the copy bytewise, and take care of zero'ing out the
3017 upper bytes if the target object is in 32-bit units
3018 (that is, in UCS-4 builds). */
3019 while (_q < aligned_end) {
3020 unsigned long data = * (unsigned long *) _q;
3021 if (data & SWAPPED_FAST_CHAR_MASK)
3022 break;
3023 /* Zero upper bytes in UCS-4 builds */
3024#if (Py_UNICODE_SIZE > 2)
3025 _p[0] = 0;
3026 _p[1] = 0;
3027#if (SIZEOF_LONG == 8)
3028 _p[2] = 0;
3029 _p[3] = 0;
3030#endif
3031#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003032 /* Issue #4916; UCS-4 builds on big endian machines must
3033 fill the two last bytes of each 4-byte unit. */
3034#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3035# define OFF 2
3036#else
3037# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003038#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003039 ((unsigned char *) _p)[OFF + 1] = _q[0];
3040 ((unsigned char *) _p)[OFF + 0] = _q[1];
3041 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3042 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3043#if (SIZEOF_LONG == 8)
3044 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3045 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3046 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3047 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3048#endif
3049#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003050 _q += SIZEOF_LONG;
3051 _p += SIZEOF_LONG / 2;
3052 }
3053 }
3054 p = _p;
3055 q = _q;
3056 if (q >= e)
3057 break;
3058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060
Benjamin Peterson14339b62009-01-31 16:36:08 +00003061 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003062
3063 if (ch < 0xD800 || ch > 0xDFFF) {
3064 *p++ = ch;
3065 continue;
3066 }
3067
3068 /* UTF-16 code pair: */
3069 if (q > e) {
3070 errmsg = "unexpected end of data";
3071 startinpos = (((const char *)q) - 2) - starts;
3072 endinpos = ((const char *)e) + 1 - starts;
3073 goto utf16Error;
3074 }
3075 if (0xD800 <= ch && ch <= 0xDBFF) {
3076 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3077 q += 2;
3078 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003079#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 *p++ = ch;
3081 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003082#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003083 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003084#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003085 continue;
3086 }
3087 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003088 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 startinpos = (((const char *)q)-4)-starts;
3090 endinpos = startinpos+2;
3091 goto utf16Error;
3092 }
3093
Benjamin Peterson14339b62009-01-31 16:36:08 +00003094 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003095 errmsg = "illegal encoding";
3096 startinpos = (((const char *)q)-2)-starts;
3097 endinpos = startinpos+2;
3098 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003099
Benjamin Peterson29060642009-01-31 22:14:21 +00003100 utf16Error:
3101 outpos = p - PyUnicode_AS_UNICODE(unicode);
3102 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003103 errors,
3104 &errorHandler,
3105 "utf16", errmsg,
3106 &starts,
3107 (const char **)&e,
3108 &startinpos,
3109 &endinpos,
3110 &exc,
3111 (const char **)&q,
3112 &unicode,
3113 &outpos,
3114 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003115 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003117 /* remaining byte at the end? (size should be even) */
3118 if (e == q) {
3119 if (!consumed) {
3120 errmsg = "truncated data";
3121 startinpos = ((const char *)q) - starts;
3122 endinpos = ((const char *)e) + 1 - starts;
3123 outpos = p - PyUnicode_AS_UNICODE(unicode);
3124 if (unicode_decode_call_errorhandler(
3125 errors,
3126 &errorHandler,
3127 "utf16", errmsg,
3128 &starts,
3129 (const char **)&e,
3130 &startinpos,
3131 &endinpos,
3132 &exc,
3133 (const char **)&q,
3134 &unicode,
3135 &outpos,
3136 &p))
3137 goto onError;
3138 /* The remaining input chars are ignored if the callback
3139 chooses to skip the input */
3140 }
3141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142
3143 if (byteorder)
3144 *byteorder = bo;
3145
Walter Dörwald69652032004-09-07 20:24:22 +00003146 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003147 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003148
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003150 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 goto onError;
3152
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003153 Py_XDECREF(errorHandler);
3154 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 return (PyObject *)unicode;
3156
Benjamin Peterson29060642009-01-31 22:14:21 +00003157 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 Py_XDECREF(errorHandler);
3160 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 return NULL;
3162}
3163
Antoine Pitrouab868312009-01-10 15:40:25 +00003164#undef FAST_CHAR_MASK
3165#undef SWAPPED_FAST_CHAR_MASK
3166
Tim Peters772747b2001-08-09 22:21:55 +00003167PyObject *
3168PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 Py_ssize_t size,
3170 const char *errors,
3171 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003173 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003174 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003175 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003176#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003177 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003178#else
3179 const int pairs = 0;
3180#endif
Tim Peters772747b2001-08-09 22:21:55 +00003181 /* Offsets from p for storing byte pairs in the right order. */
3182#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3183 int ihi = 1, ilo = 0;
3184#else
3185 int ihi = 0, ilo = 1;
3186#endif
3187
Benjamin Peterson29060642009-01-31 22:14:21 +00003188#define STORECHAR(CH) \
3189 do { \
3190 p[ihi] = ((CH) >> 8) & 0xff; \
3191 p[ilo] = (CH) & 0xff; \
3192 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003193 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003195#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003196 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 if (s[i] >= 0x10000)
3198 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003199#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003200 /* 2 * (size + pairs + (byteorder == 0)) */
3201 if (size > PY_SSIZE_T_MAX ||
3202 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003203 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003204 nsize = size + pairs + (byteorder == 0);
3205 bytesize = nsize * 2;
3206 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003207 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003208 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 if (v == NULL)
3210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003212 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003214 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003215 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003216 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003217
3218 if (byteorder == -1) {
3219 /* force LE */
3220 ihi = 1;
3221 ilo = 0;
3222 }
3223 else if (byteorder == 1) {
3224 /* force BE */
3225 ihi = 0;
3226 ilo = 1;
3227 }
3228
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003229 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 Py_UNICODE ch = *s++;
3231 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003232#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 if (ch >= 0x10000) {
3234 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3235 ch = 0xD800 | ((ch-0x10000) >> 10);
3236 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003237#endif
Tim Peters772747b2001-08-09 22:21:55 +00003238 STORECHAR(ch);
3239 if (ch2)
3240 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003241 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003242
3243 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003244 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003245#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246}
3247
3248PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3249{
3250 if (!PyUnicode_Check(unicode)) {
3251 PyErr_BadArgument();
3252 return NULL;
3253 }
3254 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003255 PyUnicode_GET_SIZE(unicode),
3256 NULL,
3257 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258}
3259
3260/* --- Unicode Escape Codec ----------------------------------------------- */
3261
Fredrik Lundh06d12682001-01-24 07:59:11 +00003262static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003263
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 Py_ssize_t size,
3266 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003269 Py_ssize_t startinpos;
3270 Py_ssize_t endinpos;
3271 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003274 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003276 char* message;
3277 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 PyObject *errorHandler = NULL;
3279 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003280
Guido van Rossumd57fd912000-03-10 22:53:23 +00003281 /* Escaped strings will always be longer than the resulting
3282 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 length after conversion to the true value.
3284 (but if the error callback returns a long replacement string
3285 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 v = _PyUnicode_New(size);
3287 if (v == NULL)
3288 goto onError;
3289 if (size == 0)
3290 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003294
Guido van Rossumd57fd912000-03-10 22:53:23 +00003295 while (s < end) {
3296 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003297 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003298 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299
3300 /* Non-escape characters are interpreted as Unicode ordinals */
3301 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003302 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 continue;
3304 }
3305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 /* \ - Escapes */
3308 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003309 c = *s++;
3310 if (s > end)
3311 c = '\0'; /* Invalid after \ */
3312 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315 case '\n': break;
3316 case '\\': *p++ = '\\'; break;
3317 case '\'': *p++ = '\''; break;
3318 case '\"': *p++ = '\"'; break;
3319 case 'b': *p++ = '\b'; break;
3320 case 'f': *p++ = '\014'; break; /* FF */
3321 case 't': *p++ = '\t'; break;
3322 case 'n': *p++ = '\n'; break;
3323 case 'r': *p++ = '\r'; break;
3324 case 'v': *p++ = '\013'; break; /* VT */
3325 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3326
Benjamin Peterson29060642009-01-31 22:14:21 +00003327 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 case '0': case '1': case '2': case '3':
3329 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003330 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003331 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003332 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003333 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003334 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003336 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 break;
3338
Benjamin Peterson29060642009-01-31 22:14:21 +00003339 /* hex escapes */
3340 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003342 digits = 2;
3343 message = "truncated \\xXX escape";
3344 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003348 digits = 4;
3349 message = "truncated \\uXXXX escape";
3350 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351
Benjamin Peterson29060642009-01-31 22:14:21 +00003352 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003353 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003354 digits = 8;
3355 message = "truncated \\UXXXXXXXX escape";
3356 hexescape:
3357 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003358 outpos = p-PyUnicode_AS_UNICODE(v);
3359 if (s+digits>end) {
3360 endinpos = size;
3361 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003362 errors, &errorHandler,
3363 "unicodeescape", "end of string in escape sequence",
3364 &starts, &end, &startinpos, &endinpos, &exc, &s,
3365 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 goto onError;
3367 goto nextByte;
3368 }
3369 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003370 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003371 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003372 endinpos = (s+i+1)-starts;
3373 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003374 errors, &errorHandler,
3375 "unicodeescape", message,
3376 &starts, &end, &startinpos, &endinpos, &exc, &s,
3377 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003378 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003379 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003380 }
3381 chr = (chr<<4) & ~0xF;
3382 if (c >= '0' && c <= '9')
3383 chr += c - '0';
3384 else if (c >= 'a' && c <= 'f')
3385 chr += 10 + c - 'a';
3386 else
3387 chr += 10 + c - 'A';
3388 }
3389 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003390 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391 /* _decoding_error will have already written into the
3392 target buffer. */
3393 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003394 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003395 /* when we get here, chr is a 32-bit unicode character */
3396 if (chr <= 0xffff)
3397 /* UCS-2 character */
3398 *p++ = (Py_UNICODE) chr;
3399 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003400 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003401 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003402#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003403 *p++ = chr;
3404#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003405 chr -= 0x10000L;
3406 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003407 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003408#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003409 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003410 endinpos = s-starts;
3411 outpos = p-PyUnicode_AS_UNICODE(v);
3412 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003413 errors, &errorHandler,
3414 "unicodeescape", "illegal Unicode character",
3415 &starts, &end, &startinpos, &endinpos, &exc, &s,
3416 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003417 goto onError;
3418 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003419 break;
3420
Benjamin Peterson29060642009-01-31 22:14:21 +00003421 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003422 case 'N':
3423 message = "malformed \\N character escape";
3424 if (ucnhash_CAPI == NULL) {
3425 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003426 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003427 if (ucnhash_CAPI == NULL)
3428 goto ucnhashError;
3429 }
3430 if (*s == '{') {
3431 const char *start = s+1;
3432 /* look for the closing brace */
3433 while (*s != '}' && s < end)
3434 s++;
3435 if (s > start && s < end && *s == '}') {
3436 /* found a name. look it up in the unicode database */
3437 message = "unknown Unicode character name";
3438 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003439 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003440 goto store;
3441 }
3442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003443 endinpos = s-starts;
3444 outpos = p-PyUnicode_AS_UNICODE(v);
3445 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 errors, &errorHandler,
3447 "unicodeescape", message,
3448 &starts, &end, &startinpos, &endinpos, &exc, &s,
3449 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003450 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003451 break;
3452
3453 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003454 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003455 message = "\\ at end of string";
3456 s--;
3457 endinpos = s-starts;
3458 outpos = p-PyUnicode_AS_UNICODE(v);
3459 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003460 errors, &errorHandler,
3461 "unicodeescape", message,
3462 &starts, &end, &startinpos, &endinpos, &exc, &s,
3463 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003464 goto onError;
3465 }
3466 else {
3467 *p++ = '\\';
3468 *p++ = (unsigned char)s[-1];
3469 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003470 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003472 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003475 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003477 Py_XDECREF(errorHandler);
3478 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003480
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003482 PyErr_SetString(
3483 PyExc_UnicodeError,
3484 "\\N escapes not supported (can't load unicodedata module)"
3485 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003486 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 Py_XDECREF(errorHandler);
3488 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003489 return NULL;
3490
Benjamin Peterson29060642009-01-31 22:14:21 +00003491 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 Py_XDECREF(errorHandler);
3494 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 return NULL;
3496}
3497
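/* Return a Unicode-Escape string version of the Unicode object.

   Note: this comment used to describe a "quotes" flag (enclosing the
   string in u"" or u'' quotes as appropriate), but
   PyUnicode_EncodeUnicodeEscape() takes no such parameter and never
   emits surrounding quotes.

*/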
3504
Thomas Wouters477c8d52006-05-27 19:21:47 +00003505Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003506 Py_ssize_t size,
3507 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003508{
3509 /* like wcschr, but doesn't stop at NULL characters */
3510
3511 while (size-- > 0) {
3512 if (*s == ch)
3513 return s;
3514 s++;
3515 }
3516
3517 return NULL;
3518}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003519
Walter Dörwald79e913e2007-05-12 11:08:06 +00003520static const char *hexdigits = "0123456789abcdef";
3521
3522PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003525 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003528#ifdef Py_UNICODE_WIDE
3529 const Py_ssize_t expandsize = 10;
3530#else
3531 const Py_ssize_t expandsize = 6;
3532#endif
3533
Thomas Wouters89f507f2006-12-13 04:49:30 +00003534 /* XXX(nnorwitz): rather than over-allocating, it would be
3535 better to choose a different scheme. Perhaps scan the
3536 first N-chars of the string and allocate based on that size.
3537 */
3538 /* Initial allocation is based on the longest-possible unichr
3539 escape.
3540
3541 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3542 unichr, so in this case it's the longest unichr escape. In
3543 narrow (UTF-16) builds this is five chars per source unichr
3544 since there are two unichrs in the surrogate pair, so in narrow
3545 (UTF-16) builds it's not the longest unichr escape.
3546
3547 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3548 so in the narrow (UTF-16) build case it's the longest unichr
3549 escape.
3550 */
3551
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003552 if (size == 0)
3553 return PyBytes_FromStringAndSize(NULL, 0);
3554
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003555 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003556 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003557
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003558 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003559 2
3560 + expandsize*size
3561 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 if (repr == NULL)
3563 return NULL;
3564
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003565 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 while (size-- > 0) {
3568 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003569
Walter Dörwald79e913e2007-05-12 11:08:06 +00003570 /* Escape backslashes */
3571 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 *p++ = '\\';
3573 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003574 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003575 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003576
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003577#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003578 /* Map 21-bit characters to '\U00xxxxxx' */
3579 else if (ch >= 0x10000) {
3580 *p++ = '\\';
3581 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003582 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3583 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3584 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3585 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3586 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3587 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3588 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3589 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003591 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003592#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003593 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3594 else if (ch >= 0xD800 && ch < 0xDC00) {
3595 Py_UNICODE ch2;
3596 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003597
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 ch2 = *s++;
3599 size--;
3600 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3601 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3602 *p++ = '\\';
3603 *p++ = 'U';
3604 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3605 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3606 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3607 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3608 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3609 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3610 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3611 *p++ = hexdigits[ucs & 0x0000000F];
3612 continue;
3613 }
3614 /* Fall through: isolated surrogates are copied as-is */
3615 s--;
3616 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003617 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003618#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003619
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003621 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622 *p++ = '\\';
3623 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003624 *p++ = hexdigits[(ch >> 12) & 0x000F];
3625 *p++ = hexdigits[(ch >> 8) & 0x000F];
3626 *p++ = hexdigits[(ch >> 4) & 0x000F];
3627 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003629
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003630 /* Map special whitespace to '\t', \n', '\r' */
3631 else if (ch == '\t') {
3632 *p++ = '\\';
3633 *p++ = 't';
3634 }
3635 else if (ch == '\n') {
3636 *p++ = '\\';
3637 *p++ = 'n';
3638 }
3639 else if (ch == '\r') {
3640 *p++ = '\\';
3641 *p++ = 'r';
3642 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003643
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003644 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003645 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003647 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003648 *p++ = hexdigits[(ch >> 4) & 0x000F];
3649 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003650 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 /* Copy everything else as-is */
3653 else
3654 *p++ = (char) ch;
3655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003657 assert(p - PyBytes_AS_STRING(repr) > 0);
3658 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3659 return NULL;
3660 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661}
3662
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003663PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003665 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 if (!PyUnicode_Check(unicode)) {
3667 PyErr_BadArgument();
3668 return NULL;
3669 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003670 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3671 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003672 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673}
3674
3675/* --- Raw Unicode Escape Codec ------------------------------------------- */
3676
3677PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 Py_ssize_t size,
3679 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003682 Py_ssize_t startinpos;
3683 Py_ssize_t endinpos;
3684 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 const char *end;
3688 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 PyObject *errorHandler = NULL;
3690 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003691
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 /* Escaped strings will always be longer than the resulting
3693 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 length after conversion to the true value. (But decoding error
3695 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 v = _PyUnicode_New(size);
3697 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003700 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 end = s + size;
3703 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 unsigned char c;
3705 Py_UCS4 x;
3706 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003707 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 /* Non-escape characters are interpreted as Unicode ordinals */
3710 if (*s != '\\') {
3711 *p++ = (unsigned char)*s++;
3712 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003713 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 startinpos = s-starts;
3715
3716 /* \u-escapes are only interpreted iff the number of leading
3717 backslashes if odd */
3718 bs = s;
3719 for (;s < end;) {
3720 if (*s != '\\')
3721 break;
3722 *p++ = (unsigned char)*s++;
3723 }
3724 if (((s - bs) & 1) == 0 ||
3725 s >= end ||
3726 (*s != 'u' && *s != 'U')) {
3727 continue;
3728 }
3729 p--;
3730 count = *s=='u' ? 4 : 8;
3731 s++;
3732
3733 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3734 outpos = p-PyUnicode_AS_UNICODE(v);
3735 for (x = 0, i = 0; i < count; ++i, ++s) {
3736 c = (unsigned char)*s;
3737 if (!ISXDIGIT(c)) {
3738 endinpos = s-starts;
3739 if (unicode_decode_call_errorhandler(
3740 errors, &errorHandler,
3741 "rawunicodeescape", "truncated \\uXXXX",
3742 &starts, &end, &startinpos, &endinpos, &exc, &s,
3743 &v, &outpos, &p))
3744 goto onError;
3745 goto nextByte;
3746 }
3747 x = (x<<4) & ~0xF;
3748 if (c >= '0' && c <= '9')
3749 x += c - '0';
3750 else if (c >= 'a' && c <= 'f')
3751 x += 10 + c - 'a';
3752 else
3753 x += 10 + c - 'A';
3754 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003755 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 /* UCS-2 character */
3757 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003758 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 /* UCS-4 character. Either store directly, or as
3760 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003761#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003762 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003763#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 x -= 0x10000L;
3765 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3766 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003767#endif
3768 } else {
3769 endinpos = s-starts;
3770 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003771 if (unicode_decode_call_errorhandler(
3772 errors, &errorHandler,
3773 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003774 &starts, &end, &startinpos, &endinpos, &exc, &s,
3775 &v, &outpos, &p))
3776 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003777 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 nextByte:
3779 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003781 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 Py_XDECREF(errorHandler);
3784 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003786
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003789 Py_XDECREF(errorHandler);
3790 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 return NULL;
3792}
3793
3794PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003797 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 char *p;
3799 char *q;
3800
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003801#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003802 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003803#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003804 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003805#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003806
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003807 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003808 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003809
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003810 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 if (repr == NULL)
3812 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003813 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003814 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003816 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 while (size-- > 0) {
3818 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003819#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003820 /* Map 32-bit characters to '\Uxxxxxxxx' */
3821 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003822 *p++ = '\\';
3823 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003824 *p++ = hexdigits[(ch >> 28) & 0xf];
3825 *p++ = hexdigits[(ch >> 24) & 0xf];
3826 *p++ = hexdigits[(ch >> 20) & 0xf];
3827 *p++ = hexdigits[(ch >> 16) & 0xf];
3828 *p++ = hexdigits[(ch >> 12) & 0xf];
3829 *p++ = hexdigits[(ch >> 8) & 0xf];
3830 *p++ = hexdigits[(ch >> 4) & 0xf];
3831 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003832 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003833 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003834#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003835 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3836 if (ch >= 0xD800 && ch < 0xDC00) {
3837 Py_UNICODE ch2;
3838 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003839
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 ch2 = *s++;
3841 size--;
3842 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3843 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3844 *p++ = '\\';
3845 *p++ = 'U';
3846 *p++ = hexdigits[(ucs >> 28) & 0xf];
3847 *p++ = hexdigits[(ucs >> 24) & 0xf];
3848 *p++ = hexdigits[(ucs >> 20) & 0xf];
3849 *p++ = hexdigits[(ucs >> 16) & 0xf];
3850 *p++ = hexdigits[(ucs >> 12) & 0xf];
3851 *p++ = hexdigits[(ucs >> 8) & 0xf];
3852 *p++ = hexdigits[(ucs >> 4) & 0xf];
3853 *p++ = hexdigits[ucs & 0xf];
3854 continue;
3855 }
3856 /* Fall through: isolated surrogates are copied as-is */
3857 s--;
3858 size++;
3859 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003860#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003861 /* Map 16-bit characters to '\uxxxx' */
3862 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 *p++ = '\\';
3864 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003865 *p++ = hexdigits[(ch >> 12) & 0xf];
3866 *p++ = hexdigits[(ch >> 8) & 0xf];
3867 *p++ = hexdigits[(ch >> 4) & 0xf];
3868 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003870 /* Copy everything else as-is */
3871 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 *p++ = (char) ch;
3873 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003874 size = p - q;
3875
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003876 assert(size > 0);
3877 if (_PyBytes_Resize(&repr, size) < 0)
3878 return NULL;
3879 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880}
3881
3882PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3883{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003884 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003886 PyErr_BadArgument();
3887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003889 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3890 PyUnicode_GET_SIZE(unicode));
3891
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003892 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893}
3894
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003895/* --- Unicode Internal Codec ------------------------------------------- */
3896
3897PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003898 Py_ssize_t size,
3899 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003900{
3901 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003902 Py_ssize_t startinpos;
3903 Py_ssize_t endinpos;
3904 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003905 PyUnicodeObject *v;
3906 Py_UNICODE *p;
3907 const char *end;
3908 const char *reason;
3909 PyObject *errorHandler = NULL;
3910 PyObject *exc = NULL;
3911
Neal Norwitzd43069c2006-01-08 01:12:10 +00003912#ifdef Py_UNICODE_WIDE
3913 Py_UNICODE unimax = PyUnicode_GetMax();
3914#endif
3915
Thomas Wouters89f507f2006-12-13 04:49:30 +00003916 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003917 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3918 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003920 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003921 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003922 p = PyUnicode_AS_UNICODE(v);
3923 end = s + size;
3924
3925 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003926 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003927 /* We have to sanity check the raw data, otherwise doom looms for
3928 some malformed UCS-4 data. */
3929 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003930#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003931 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003932#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003933 end-s < Py_UNICODE_SIZE
3934 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003935 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003936 startinpos = s - starts;
3937 if (end-s < Py_UNICODE_SIZE) {
3938 endinpos = end-starts;
3939 reason = "truncated input";
3940 }
3941 else {
3942 endinpos = s - starts + Py_UNICODE_SIZE;
3943 reason = "illegal code point (> 0x10FFFF)";
3944 }
3945 outpos = p - PyUnicode_AS_UNICODE(v);
3946 if (unicode_decode_call_errorhandler(
3947 errors, &errorHandler,
3948 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003949 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003950 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003951 goto onError;
3952 }
3953 }
3954 else {
3955 p++;
3956 s += Py_UNICODE_SIZE;
3957 }
3958 }
3959
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003960 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003961 goto onError;
3962 Py_XDECREF(errorHandler);
3963 Py_XDECREF(exc);
3964 return (PyObject *)v;
3965
Benjamin Peterson29060642009-01-31 22:14:21 +00003966 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003967 Py_XDECREF(v);
3968 Py_XDECREF(errorHandler);
3969 Py_XDECREF(exc);
3970 return NULL;
3971}
3972
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973/* --- Latin-1 Codec ------------------------------------------------------ */
3974
3975PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 Py_ssize_t size,
3977 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978{
3979 PyUnicodeObject *v;
3980 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003981 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003982
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003984 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003985 Py_UNICODE r = *(unsigned char*)s;
3986 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003987 }
3988
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 v = _PyUnicode_New(size);
3990 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003995 e = s + size;
3996 /* Unrolling the copy makes it much faster by reducing the looping
3997 overhead. This is similar to what many memcpy() implementations do. */
3998 unrolled_end = e - 4;
3999 while (s < unrolled_end) {
4000 p[0] = (unsigned char) s[0];
4001 p[1] = (unsigned char) s[1];
4002 p[2] = (unsigned char) s[2];
4003 p[3] = (unsigned char) s[3];
4004 s += 4;
4005 p += 4;
4006 }
4007 while (s < e)
4008 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004010
Benjamin Peterson29060642009-01-31 22:14:21 +00004011 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 Py_XDECREF(v);
4013 return NULL;
4014}
4015
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004016/* create or adjust a UnicodeEncodeError */
4017static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004018 const char *encoding,
4019 const Py_UNICODE *unicode, Py_ssize_t size,
4020 Py_ssize_t startpos, Py_ssize_t endpos,
4021 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004023 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004024 *exceptionObject = PyUnicodeEncodeError_Create(
4025 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 }
4027 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4029 goto onError;
4030 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4031 goto onError;
4032 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4033 goto onError;
4034 return;
4035 onError:
4036 Py_DECREF(*exceptionObject);
4037 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 }
4039}
4040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041/* raises a UnicodeEncodeError */
4042static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 const char *encoding,
4044 const Py_UNICODE *unicode, Py_ssize_t size,
4045 Py_ssize_t startpos, Py_ssize_t endpos,
4046 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047{
4048 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004051 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004052}
4053
4054/* error handling callback helper:
4055 build arguments, call the callback and check the arguments,
4056 put the result into newpos and return the replacement string, which
4057 has to be freed by the caller */
4058static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 PyObject **errorHandler,
4060 const char *encoding, const char *reason,
4061 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4062 Py_ssize_t startpos, Py_ssize_t endpos,
4063 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004065 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066
4067 PyObject *restuple;
4068 PyObject *resunicode;
4069
4070 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004071 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004073 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 }
4075
4076 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004077 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080
4081 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004086 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004087 Py_DECREF(restuple);
4088 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004090 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 &resunicode, newpos)) {
4092 Py_DECREF(restuple);
4093 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004095 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4096 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4097 Py_DECREF(restuple);
4098 return NULL;
4099 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004102 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4104 Py_DECREF(restuple);
4105 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 Py_INCREF(resunicode);
4108 Py_DECREF(restuple);
4109 return resunicode;
4110}
4111
4112static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 Py_ssize_t size,
4114 const char *errors,
4115 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004116{
4117 /* output object */
4118 PyObject *res;
4119 /* pointers to the beginning and end+1 of input */
4120 const Py_UNICODE *startp = p;
4121 const Py_UNICODE *endp = p + size;
4122 /* pointer to the beginning of the unencodable characters */
4123 /* const Py_UNICODE *badp = NULL; */
4124 /* pointer into the output */
4125 char *str;
4126 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004127 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004128 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4129 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 PyObject *errorHandler = NULL;
4131 PyObject *exc = NULL;
4132 /* the following variable is used for caching string comparisons
4133 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4134 int known_errorHandler = -1;
4135
4136 /* allocate enough for a simple encoding without
4137 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004138 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004139 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004140 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004142 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004143 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 ressize = size;
4145
4146 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 /* can we encode this? */
4150 if (c<limit) {
4151 /* no overflow check, because we know that the space is enough */
4152 *str++ = (char)c;
4153 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004154 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 else {
4156 Py_ssize_t unicodepos = p-startp;
4157 Py_ssize_t requiredsize;
4158 PyObject *repunicode;
4159 Py_ssize_t repsize;
4160 Py_ssize_t newpos;
4161 Py_ssize_t respos;
4162 Py_UNICODE *uni2;
4163 /* startpos for collecting unencodable chars */
4164 const Py_UNICODE *collstart = p;
4165 const Py_UNICODE *collend = p;
4166 /* find all unecodable characters */
4167 while ((collend < endp) && ((*collend)>=limit))
4168 ++collend;
4169 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4170 if (known_errorHandler==-1) {
4171 if ((errors==NULL) || (!strcmp(errors, "strict")))
4172 known_errorHandler = 1;
4173 else if (!strcmp(errors, "replace"))
4174 known_errorHandler = 2;
4175 else if (!strcmp(errors, "ignore"))
4176 known_errorHandler = 3;
4177 else if (!strcmp(errors, "xmlcharrefreplace"))
4178 known_errorHandler = 4;
4179 else
4180 known_errorHandler = 0;
4181 }
4182 switch (known_errorHandler) {
4183 case 1: /* strict */
4184 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4185 goto onError;
4186 case 2: /* replace */
4187 while (collstart++<collend)
4188 *str++ = '?'; /* fall through */
4189 case 3: /* ignore */
4190 p = collend;
4191 break;
4192 case 4: /* xmlcharrefreplace */
4193 respos = str - PyBytes_AS_STRING(res);
4194 /* determine replacement size (temporarily (mis)uses p) */
4195 for (p = collstart, repsize = 0; p < collend; ++p) {
4196 if (*p<10)
4197 repsize += 2+1+1;
4198 else if (*p<100)
4199 repsize += 2+2+1;
4200 else if (*p<1000)
4201 repsize += 2+3+1;
4202 else if (*p<10000)
4203 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004204#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 else
4206 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004207#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 else if (*p<100000)
4209 repsize += 2+5+1;
4210 else if (*p<1000000)
4211 repsize += 2+6+1;
4212 else
4213 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004214#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004215 }
4216 requiredsize = respos+repsize+(endp-collend);
4217 if (requiredsize > ressize) {
4218 if (requiredsize<2*ressize)
4219 requiredsize = 2*ressize;
4220 if (_PyBytes_Resize(&res, requiredsize))
4221 goto onError;
4222 str = PyBytes_AS_STRING(res) + respos;
4223 ressize = requiredsize;
4224 }
4225 /* generate replacement (temporarily (mis)uses p) */
4226 for (p = collstart; p < collend; ++p) {
4227 str += sprintf(str, "&#%d;", (int)*p);
4228 }
4229 p = collend;
4230 break;
4231 default:
4232 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4233 encoding, reason, startp, size, &exc,
4234 collstart-startp, collend-startp, &newpos);
4235 if (repunicode == NULL)
4236 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004237 if (PyBytes_Check(repunicode)) {
4238 /* Directly copy bytes result to output. */
4239 repsize = PyBytes_Size(repunicode);
4240 if (repsize > 1) {
4241 /* Make room for all additional bytes. */
Amaury Forgeot d'Arce5344d62009-06-29 22:38:54 +00004242 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004243 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4244 Py_DECREF(repunicode);
4245 goto onError;
4246 }
Amaury Forgeot d'Arce5344d62009-06-29 22:38:54 +00004247 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004248 ressize += repsize-1;
4249 }
4250 memcpy(str, PyBytes_AsString(repunicode), repsize);
4251 str += repsize;
4252 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004253 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004254 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004255 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 /* need more space? (at least enough for what we
4257 have+the replacement+the rest of the string, so
4258 we won't have to check space for encodable characters) */
4259 respos = str - PyBytes_AS_STRING(res);
4260 repsize = PyUnicode_GET_SIZE(repunicode);
4261 requiredsize = respos+repsize+(endp-collend);
4262 if (requiredsize > ressize) {
4263 if (requiredsize<2*ressize)
4264 requiredsize = 2*ressize;
4265 if (_PyBytes_Resize(&res, requiredsize)) {
4266 Py_DECREF(repunicode);
4267 goto onError;
4268 }
4269 str = PyBytes_AS_STRING(res) + respos;
4270 ressize = requiredsize;
4271 }
4272 /* check if there is anything unencodable in the replacement
4273 and copy it to the output */
4274 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4275 c = *uni2;
4276 if (c >= limit) {
4277 raise_encode_exception(&exc, encoding, startp, size,
4278 unicodepos, unicodepos+1, reason);
4279 Py_DECREF(repunicode);
4280 goto onError;
4281 }
4282 *str = (char)c;
4283 }
4284 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004285 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004286 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004287 }
4288 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004289 /* Resize if we allocated to much */
4290 size = str - PyBytes_AS_STRING(res);
4291 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004292 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004293 if (_PyBytes_Resize(&res, size) < 0)
4294 goto onError;
4295 }
4296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 Py_XDECREF(errorHandler);
4298 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004299 return res;
4300
4301 onError:
4302 Py_XDECREF(res);
4303 Py_XDECREF(errorHandler);
4304 Py_XDECREF(exc);
4305 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306}
4307
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 Py_ssize_t size,
4310 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313}
4314
4315PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4316{
4317 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 PyErr_BadArgument();
4319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 }
4321 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 PyUnicode_GET_SIZE(unicode),
4323 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324}
4325
4326/* --- 7-bit ASCII Codec -------------------------------------------------- */
4327
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 Py_ssize_t size,
4330 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333 PyUnicodeObject *v;
4334 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004335 Py_ssize_t startinpos;
4336 Py_ssize_t endinpos;
4337 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 const char *e;
4339 PyObject *errorHandler = NULL;
4340 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004341
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004343 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 Py_UNICODE r = *(unsigned char*)s;
4345 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004346 }
Tim Petersced69f82003-09-16 20:30:58 +00004347
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 v = _PyUnicode_New(size);
4349 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004350 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 e = s + size;
4355 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 register unsigned char c = (unsigned char)*s;
4357 if (c < 128) {
4358 *p++ = c;
4359 ++s;
4360 }
4361 else {
4362 startinpos = s-starts;
4363 endinpos = startinpos + 1;
4364 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4365 if (unicode_decode_call_errorhandler(
4366 errors, &errorHandler,
4367 "ascii", "ordinal not in range(128)",
4368 &starts, &e, &startinpos, &endinpos, &exc, &s,
4369 &v, &outpos, &p))
4370 goto onError;
4371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004373 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004374 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4375 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 Py_XDECREF(errorHandler);
4377 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004379
Benjamin Peterson29060642009-01-31 22:14:21 +00004380 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382 Py_XDECREF(errorHandler);
4383 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384 return NULL;
4385}
4386
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 Py_ssize_t size,
4389 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392}
4393
4394PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4395{
4396 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 PyErr_BadArgument();
4398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 }
4400 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 PyUnicode_GET_SIZE(unicode),
4402 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403}
4404
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004405#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004406
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004407/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004408
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004409#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004410#define NEED_RETRY
4411#endif
4412
4413/* XXX This code is limited to "true" double-byte encodings, as
4414 a) it assumes an incomplete character consists of a single byte, and
4415 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004417
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte.

   The byte must satisfy IsDBCSLeadByte(); in addition, the character that
   CharPrev() finds before it must either be the same position, not start
   with a lead byte, or be exactly two bytes away. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4428
4429/*
4430 * Decode MBCS string into unicode object. If 'final' is set, converts
4431 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4432 */
4433static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 const char *s, /* MBCS string */
4435 int size, /* sizeof MBCS string */
4436 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004437{
4438 Py_UNICODE *p;
4439 Py_ssize_t n = 0;
4440 int usize = 0;
4441
4442 assert(size >= 0);
4443
4444 /* Skip trailing lead-byte unless 'final' is set */
4445 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004447
4448 /* First get the size of the result */
4449 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4451 if (usize == 0) {
4452 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4453 return -1;
4454 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004455 }
4456
4457 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 /* Create unicode object */
4459 *v = _PyUnicode_New(usize);
4460 if (*v == NULL)
4461 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004462 }
4463 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 /* Extend unicode object */
4465 n = PyUnicode_GET_SIZE(*v);
4466 if (_PyUnicode_Resize(v, n + usize) < 0)
4467 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004468 }
4469
4470 /* Do the conversion */
4471 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 p = PyUnicode_AS_UNICODE(*v) + n;
4473 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4474 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4475 return -1;
4476 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004477 }
4478
4479 return size;
4480}
4481
4482PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 Py_ssize_t size,
4484 const char *errors,
4485 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004486{
4487 PyUnicodeObject *v = NULL;
4488 int done;
4489
4490 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004492
4493#ifdef NEED_RETRY
4494 retry:
4495 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004497 else
4498#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004500
4501 if (done < 0) {
4502 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004504 }
4505
4506 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004508
4509#ifdef NEED_RETRY
4510 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 s += done;
4512 size -= done;
4513 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004514 }
4515#endif
4516
4517 return (PyObject *)v;
4518}
4519
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004520PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 Py_ssize_t size,
4522 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004523{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004524 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4525}
4526
4527/*
4528 * Convert unicode into string object (MBCS).
4529 * Returns 0 if succeed, -1 otherwise.
4530 */
4531static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 const Py_UNICODE *p, /* unicode */
4533 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004534{
4535 int mbcssize = 0;
4536 Py_ssize_t n = 0;
4537
4538 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004539
4540 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004541 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4543 if (mbcssize == 0) {
4544 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4545 return -1;
4546 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004547 }
4548
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004549 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 /* Create string object */
4551 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4552 if (*repr == NULL)
4553 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004554 }
4555 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 /* Extend string object */
4557 n = PyBytes_Size(*repr);
4558 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4559 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004560 }
4561
4562 /* Do the conversion */
4563 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 char *s = PyBytes_AS_STRING(*repr) + n;
4565 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4566 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4567 return -1;
4568 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004569 }
4570
4571 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004572}
4573
4574PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 Py_ssize_t size,
4576 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004577{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004578 PyObject *repr = NULL;
4579 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004580
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004581#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004583 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004585 else
4586#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004588
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004589 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 Py_XDECREF(repr);
4591 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004592 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004593
4594#ifdef NEED_RETRY
4595 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 p += INT_MAX;
4597 size -= INT_MAX;
4598 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004599 }
4600#endif
4601
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004602 return repr;
4603}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004604
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004605PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4606{
4607 if (!PyUnicode_Check(unicode)) {
4608 PyErr_BadArgument();
4609 return NULL;
4610 }
4611 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 PyUnicode_GET_SIZE(unicode),
4613 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004614}
4615
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004616#undef NEED_RETRY
4617
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004618#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004619
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620/* --- Character Mapping Codec -------------------------------------------- */
4621
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 Py_ssize_t size,
4624 PyObject *mapping,
4625 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004628 Py_ssize_t startinpos;
4629 Py_ssize_t endinpos;
4630 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 PyUnicodeObject *v;
4633 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004634 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004635 PyObject *errorHandler = NULL;
4636 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004637 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004639
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640 /* Default to Latin-1 */
4641 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643
4644 v = _PyUnicode_New(size);
4645 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004651 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004652 mapstring = PyUnicode_AS_UNICODE(mapping);
4653 maplen = PyUnicode_GET_SIZE(mapping);
4654 while (s < e) {
4655 unsigned char ch = *s;
4656 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 if (ch < maplen)
4659 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660
Benjamin Peterson29060642009-01-31 22:14:21 +00004661 if (x == 0xfffe) {
4662 /* undefined mapping */
4663 outpos = p-PyUnicode_AS_UNICODE(v);
4664 startinpos = s-starts;
4665 endinpos = startinpos+1;
4666 if (unicode_decode_call_errorhandler(
4667 errors, &errorHandler,
4668 "charmap", "character maps to <undefined>",
4669 &starts, &e, &startinpos, &endinpos, &exc, &s,
4670 &v, &outpos, &p)) {
4671 goto onError;
4672 }
4673 continue;
4674 }
4675 *p++ = x;
4676 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004677 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004678 }
4679 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 while (s < e) {
4681 unsigned char ch = *s;
4682 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004683
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4685 w = PyLong_FromLong((long)ch);
4686 if (w == NULL)
4687 goto onError;
4688 x = PyObject_GetItem(mapping, w);
4689 Py_DECREF(w);
4690 if (x == NULL) {
4691 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4692 /* No mapping found means: mapping is undefined. */
4693 PyErr_Clear();
4694 x = Py_None;
4695 Py_INCREF(x);
4696 } else
4697 goto onError;
4698 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004699
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 /* Apply mapping */
4701 if (PyLong_Check(x)) {
4702 long value = PyLong_AS_LONG(x);
4703 if (value < 0 || value > 65535) {
4704 PyErr_SetString(PyExc_TypeError,
4705 "character mapping must be in range(65536)");
4706 Py_DECREF(x);
4707 goto onError;
4708 }
4709 *p++ = (Py_UNICODE)value;
4710 }
4711 else if (x == Py_None) {
4712 /* undefined mapping */
4713 outpos = p-PyUnicode_AS_UNICODE(v);
4714 startinpos = s-starts;
4715 endinpos = startinpos+1;
4716 if (unicode_decode_call_errorhandler(
4717 errors, &errorHandler,
4718 "charmap", "character maps to <undefined>",
4719 &starts, &e, &startinpos, &endinpos, &exc, &s,
4720 &v, &outpos, &p)) {
4721 Py_DECREF(x);
4722 goto onError;
4723 }
4724 Py_DECREF(x);
4725 continue;
4726 }
4727 else if (PyUnicode_Check(x)) {
4728 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004729
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 if (targetsize == 1)
4731 /* 1-1 mapping */
4732 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004733
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 else if (targetsize > 1) {
4735 /* 1-n mapping */
4736 if (targetsize > extrachars) {
4737 /* resize first */
4738 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4739 Py_ssize_t needed = (targetsize - extrachars) + \
4740 (targetsize << 2);
4741 extrachars += needed;
4742 /* XXX overflow detection missing */
4743 if (_PyUnicode_Resize(&v,
4744 PyUnicode_GET_SIZE(v) + needed) < 0) {
4745 Py_DECREF(x);
4746 goto onError;
4747 }
4748 p = PyUnicode_AS_UNICODE(v) + oldpos;
4749 }
4750 Py_UNICODE_COPY(p,
4751 PyUnicode_AS_UNICODE(x),
4752 targetsize);
4753 p += targetsize;
4754 extrachars -= targetsize;
4755 }
4756 /* 1-0 mapping: skip the character */
4757 }
4758 else {
4759 /* wrong return value */
4760 PyErr_SetString(PyExc_TypeError,
4761 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004762 Py_DECREF(x);
4763 goto onError;
4764 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 Py_DECREF(x);
4766 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 }
4769 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4771 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 Py_XDECREF(errorHandler);
4773 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004775
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 Py_XDECREF(errorHandler);
4778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 Py_XDECREF(v);
4780 return NULL;
4781}
4782
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004783/* Charmap encoding: the lookup table */
4784
4785struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 PyObject_HEAD
4787 unsigned char level1[32];
4788 int count2, count3;
4789 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004790};
4791
4792static PyObject*
4793encoding_map_size(PyObject *obj, PyObject* args)
4794{
4795 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004796 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004798}
4799
4800static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004801 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 PyDoc_STR("Return the size (in bytes) of this object") },
4803 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004804};
4805
/* tp_dealloc slot of the EncodingMap type: releases the object's memory
   with PyObject_FREE().  The object holds no references to other
   objects, so nothing else needs to be released here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4811
/* Type object for struct encoding_map instances.

   Only tp_name, tp_basicsize, tp_dealloc, tp_flags and tp_methods are
   filled in; every other slot is left as 0.  tp_new is 0, and instances
   are created directly by PyUnicode_BuildEncodingMap() using
   PyObject_MALLOC() + PyObject_Init(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4855
4856PyObject*
4857PyUnicode_BuildEncodingMap(PyObject* string)
4858{
4859 Py_UNICODE *decode;
4860 PyObject *result;
4861 struct encoding_map *mresult;
4862 int i;
4863 int need_dict = 0;
4864 unsigned char level1[32];
4865 unsigned char level2[512];
4866 unsigned char *mlevel1, *mlevel2, *mlevel3;
4867 int count2 = 0, count3 = 0;
4868
4869 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4870 PyErr_BadArgument();
4871 return NULL;
4872 }
4873 decode = PyUnicode_AS_UNICODE(string);
4874 memset(level1, 0xFF, sizeof level1);
4875 memset(level2, 0xFF, sizeof level2);
4876
4877 /* If there isn't a one-to-one mapping of NULL to \0,
4878 or if there are non-BMP characters, we need to use
4879 a mapping dictionary. */
4880 if (decode[0] != 0)
4881 need_dict = 1;
4882 for (i = 1; i < 256; i++) {
4883 int l1, l2;
4884 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004885#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004886 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004887#endif
4888 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004889 need_dict = 1;
4890 break;
4891 }
4892 if (decode[i] == 0xFFFE)
4893 /* unmapped character */
4894 continue;
4895 l1 = decode[i] >> 11;
4896 l2 = decode[i] >> 7;
4897 if (level1[l1] == 0xFF)
4898 level1[l1] = count2++;
4899 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004900 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004901 }
4902
4903 if (count2 >= 0xFF || count3 >= 0xFF)
4904 need_dict = 1;
4905
4906 if (need_dict) {
4907 PyObject *result = PyDict_New();
4908 PyObject *key, *value;
4909 if (!result)
4910 return NULL;
4911 for (i = 0; i < 256; i++) {
4912 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004913 key = PyLong_FromLong(decode[i]);
4914 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004915 if (!key || !value)
4916 goto failed1;
4917 if (PyDict_SetItem(result, key, value) == -1)
4918 goto failed1;
4919 Py_DECREF(key);
4920 Py_DECREF(value);
4921 }
4922 return result;
4923 failed1:
4924 Py_XDECREF(key);
4925 Py_XDECREF(value);
4926 Py_DECREF(result);
4927 return NULL;
4928 }
4929
4930 /* Create a three-level trie */
4931 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4932 16*count2 + 128*count3 - 1);
4933 if (!result)
4934 return PyErr_NoMemory();
4935 PyObject_Init(result, &EncodingMapType);
4936 mresult = (struct encoding_map*)result;
4937 mresult->count2 = count2;
4938 mresult->count3 = count3;
4939 mlevel1 = mresult->level1;
4940 mlevel2 = mresult->level23;
4941 mlevel3 = mresult->level23 + 16*count2;
4942 memcpy(mlevel1, level1, 32);
4943 memset(mlevel2, 0xFF, 16*count2);
4944 memset(mlevel3, 0, 128*count3);
4945 count3 = 0;
4946 for (i = 1; i < 256; i++) {
4947 int o1, o2, o3, i2, i3;
4948 if (decode[i] == 0xFFFE)
4949 /* unmapped character */
4950 continue;
4951 o1 = decode[i]>>11;
4952 o2 = (decode[i]>>7) & 0xF;
4953 i2 = 16*mlevel1[o1] + o2;
4954 if (mlevel2[i2] == 0xFF)
4955 mlevel2[i2] = count3++;
4956 o3 = decode[i] & 0x7F;
4957 i3 = 128*mlevel2[i2] + o3;
4958 mlevel3[i3] = i;
4959 }
4960 return result;
4961}
4962
4963static int
4964encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4965{
4966 struct encoding_map *map = (struct encoding_map*)mapping;
4967 int l1 = c>>11;
4968 int l2 = (c>>7) & 0xF;
4969 int l3 = c & 0x7F;
4970 int i;
4971
4972#ifdef Py_UNICODE_WIDE
4973 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004975 }
4976#endif
4977 if (c == 0)
4978 return 0;
4979 /* level 1*/
4980 i = map->level1[l1];
4981 if (i == 0xFF) {
4982 return -1;
4983 }
4984 /* level 2*/
4985 i = map->level23[16*i+l2];
4986 if (i == 0xFF) {
4987 return -1;
4988 }
4989 /* level 3 */
4990 i = map->level23[16*map->count2 + 128*i + l3];
4991 if (i == 0) {
4992 return -1;
4993 }
4994 return i;
4995}
4996
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004997/* Lookup the character ch in the mapping. If the character
4998 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004999 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001{
Christian Heimes217cfd12007-12-02 14:31:20 +00005002 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005003 PyObject *x;
5004
5005 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 x = PyObject_GetItem(mapping, w);
5008 Py_DECREF(w);
5009 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5011 /* No mapping found means: mapping is undefined. */
5012 PyErr_Clear();
5013 x = Py_None;
5014 Py_INCREF(x);
5015 return x;
5016 } else
5017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005019 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005021 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 long value = PyLong_AS_LONG(x);
5023 if (value < 0 || value > 255) {
5024 PyErr_SetString(PyExc_TypeError,
5025 "character mapping must be in range(256)");
5026 Py_DECREF(x);
5027 return NULL;
5028 }
5029 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005031 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 /* wrong return value */
5035 PyErr_Format(PyExc_TypeError,
5036 "character mapping must return integer, bytes or None, not %.400s",
5037 x->ob_type->tp_name);
5038 Py_DECREF(x);
5039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 }
5041}
5042
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005043static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005044charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005045{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005046 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5047 /* exponentially overallocate to minimize reallocations */
5048 if (requiredsize < 2*outsize)
5049 requiredsize = 2*outsize;
5050 if (_PyBytes_Resize(outobj, requiredsize))
5051 return -1;
5052 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005053}
5054
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was encoded and written to the output
     enc_FAILED    - the mapping has no entry for the character; nothing
                     was written and no exception is set
     enc_EXCEPTION - a lookup or resize failed with an exception set */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005059 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 space is available. Return a new reference to the object that
5061 was put in the output buffer, or Py_None, if the mapping was undefined
5062 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005063 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005064static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005065charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005067{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005068 PyObject *rep;
5069 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005070 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005071
Christian Heimes90aa7642007-12-19 02:45:37 +00005072 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005073 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005075 if (res == -1)
5076 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 if (outsize<requiredsize)
5078 if (charmapencode_resize(outobj, outpos, requiredsize))
5079 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005080 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 outstart[(*outpos)++] = (char)res;
5082 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005083 }
5084
5085 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005086 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005088 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 Py_DECREF(rep);
5090 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005091 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 if (PyLong_Check(rep)) {
5093 Py_ssize_t requiredsize = *outpos+1;
5094 if (outsize<requiredsize)
5095 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5096 Py_DECREF(rep);
5097 return enc_EXCEPTION;
5098 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005099 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005101 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 else {
5103 const char *repchars = PyBytes_AS_STRING(rep);
5104 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5105 Py_ssize_t requiredsize = *outpos+repsize;
5106 if (outsize<requiredsize)
5107 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5108 Py_DECREF(rep);
5109 return enc_EXCEPTION;
5110 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005111 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 memcpy(outstart + *outpos, repchars, repsize);
5113 *outpos += repsize;
5114 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005116 Py_DECREF(rep);
5117 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118}
5119
5120/* handle an error in PyUnicode_EncodeCharmap
5121 Return 0 on success, -1 on error */
5122static
5123int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005124 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005125 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005126 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005127 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128{
5129 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005130 Py_ssize_t repsize;
5131 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132 Py_UNICODE *uni2;
5133 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005134 Py_ssize_t collstartpos = *inpos;
5135 Py_ssize_t collendpos = *inpos+1;
5136 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137 char *encoding = "charmap";
5138 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005139 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141 /* find all unencodable characters */
5142 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005143 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005144 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 int res = encoding_map_lookup(p[collendpos], mapping);
5146 if (res != -1)
5147 break;
5148 ++collendpos;
5149 continue;
5150 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005151
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 rep = charmapencode_lookup(p[collendpos], mapping);
5153 if (rep==NULL)
5154 return -1;
5155 else if (rep!=Py_None) {
5156 Py_DECREF(rep);
5157 break;
5158 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005159 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005160 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005161 }
5162 /* cache callback name lookup
5163 * (if not done yet, i.e. it's the first error) */
5164 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 if ((errors==NULL) || (!strcmp(errors, "strict")))
5166 *known_errorHandler = 1;
5167 else if (!strcmp(errors, "replace"))
5168 *known_errorHandler = 2;
5169 else if (!strcmp(errors, "ignore"))
5170 *known_errorHandler = 3;
5171 else if (!strcmp(errors, "xmlcharrefreplace"))
5172 *known_errorHandler = 4;
5173 else
5174 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005175 }
5176 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005177 case 1: /* strict */
5178 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5179 return -1;
5180 case 2: /* replace */
5181 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 x = charmapencode_output('?', mapping, res, respos);
5183 if (x==enc_EXCEPTION) {
5184 return -1;
5185 }
5186 else if (x==enc_FAILED) {
5187 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5188 return -1;
5189 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005190 }
5191 /* fall through */
5192 case 3: /* ignore */
5193 *inpos = collendpos;
5194 break;
5195 case 4: /* xmlcharrefreplace */
5196 /* generate replacement (temporarily (mis)uses p) */
5197 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005198 char buffer[2+29+1+1];
5199 char *cp;
5200 sprintf(buffer, "&#%d;", (int)p[collpos]);
5201 for (cp = buffer; *cp; ++cp) {
5202 x = charmapencode_output(*cp, mapping, res, respos);
5203 if (x==enc_EXCEPTION)
5204 return -1;
5205 else if (x==enc_FAILED) {
5206 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5207 return -1;
5208 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005209 }
5210 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005211 *inpos = collendpos;
5212 break;
5213 default:
5214 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 encoding, reason, p, size, exceptionObject,
5216 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005217 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005219 if (PyBytes_Check(repunicode)) {
5220 /* Directly copy bytes result to output. */
5221 Py_ssize_t outsize = PyBytes_Size(*res);
5222 Py_ssize_t requiredsize;
5223 repsize = PyBytes_Size(repunicode);
5224 requiredsize = *respos + repsize;
5225 if (requiredsize > outsize)
5226 /* Make room for all additional bytes. */
5227 if (charmapencode_resize(res, respos, requiredsize)) {
5228 Py_DECREF(repunicode);
5229 return -1;
5230 }
5231 memcpy(PyBytes_AsString(*res) + *respos,
5232 PyBytes_AsString(repunicode), repsize);
5233 *respos += repsize;
5234 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005235 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005236 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005237 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005238 /* generate replacement */
5239 repsize = PyUnicode_GET_SIZE(repunicode);
5240 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 x = charmapencode_output(*uni2, mapping, res, respos);
5242 if (x==enc_EXCEPTION) {
5243 return -1;
5244 }
5245 else if (x==enc_FAILED) {
5246 Py_DECREF(repunicode);
5247 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5248 return -1;
5249 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005250 }
5251 *inpos = newpos;
5252 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253 }
5254 return 0;
5255}
5256
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 Py_ssize_t size,
5259 PyObject *mapping,
5260 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005262 /* output object */
5263 PyObject *res = NULL;
5264 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005265 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005267 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268 PyObject *errorHandler = NULL;
5269 PyObject *exc = NULL;
5270 /* the following variable is used for caching string comparisons
5271 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5272 * 3=ignore, 4=xmlcharrefreplace */
5273 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
5275 /* Default to Latin-1 */
5276 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279 /* allocate enough for a simple encoding without
5280 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005281 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005282 if (res == NULL)
5283 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005284 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005287 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 /* try to encode it */
5289 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5290 if (x==enc_EXCEPTION) /* error */
5291 goto onError;
5292 if (x==enc_FAILED) { /* unencodable character */
5293 if (charmap_encoding_error(p, size, &inpos, mapping,
5294 &exc,
5295 &known_errorHandler, &errorHandler, errors,
5296 &res, &respos)) {
5297 goto onError;
5298 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005299 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 else
5301 /* done with this character => adjust input position */
5302 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005305 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005306 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005307 if (_PyBytes_Resize(&res, respos) < 0)
5308 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005309
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005310 Py_XDECREF(exc);
5311 Py_XDECREF(errorHandler);
5312 return res;
5313
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005315 Py_XDECREF(res);
5316 Py_XDECREF(exc);
5317 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 return NULL;
5319}
5320
5321PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323{
5324 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 PyErr_BadArgument();
5326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 }
5328 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 PyUnicode_GET_SIZE(unicode),
5330 mapping,
5331 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332}
5333
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005334/* create or adjust a UnicodeTranslateError */
5335static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 const Py_UNICODE *unicode, Py_ssize_t size,
5337 Py_ssize_t startpos, Py_ssize_t endpos,
5338 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005341 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 }
5344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5346 goto onError;
5347 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5348 goto onError;
5349 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5350 goto onError;
5351 return;
5352 onError:
5353 Py_DECREF(*exceptionObject);
5354 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 }
5356}
5357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005358/* raises a UnicodeTranslateError */
5359static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 const Py_UNICODE *unicode, Py_ssize_t size,
5361 Py_ssize_t startpos, Py_ssize_t endpos,
5362 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363{
5364 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368}
5369
5370/* error handling callback helper:
5371 build arguments, call the callback and check the arguments,
5372 put the result into newpos and return the replacement string, which
5373 has to be freed by the caller */
5374static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 PyObject **errorHandler,
5376 const char *reason,
5377 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5378 Py_ssize_t startpos, Py_ssize_t endpos,
5379 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005381 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005382
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005383 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 PyObject *restuple;
5385 PyObject *resunicode;
5386
5387 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005391 }
5392
5393 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397
5398 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005400 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005402 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005403 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 Py_DECREF(restuple);
5405 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005406 }
5407 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 &resunicode, &i_newpos)) {
5409 Py_DECREF(restuple);
5410 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005411 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005412 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005414 else
5415 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005416 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5418 Py_DECREF(restuple);
5419 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005420 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005421 Py_INCREF(resunicode);
5422 Py_DECREF(restuple);
5423 return resunicode;
5424}
5425
5426/* Lookup the character ch in the mapping and put the result in result,
5427 which must be decrefed by the caller.
5428 Return 0 on success, -1 on error */
5429static
5430int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5431{
Christian Heimes217cfd12007-12-02 14:31:20 +00005432 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005433 PyObject *x;
5434
5435 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 x = PyObject_GetItem(mapping, w);
5438 Py_DECREF(w);
5439 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5441 /* No mapping found means: use 1:1 mapping. */
5442 PyErr_Clear();
5443 *result = NULL;
5444 return 0;
5445 } else
5446 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447 }
5448 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 *result = x;
5450 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005451 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005452 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 long value = PyLong_AS_LONG(x);
5454 long max = PyUnicode_GetMax();
5455 if (value < 0 || value > max) {
5456 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005457 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 Py_DECREF(x);
5459 return -1;
5460 }
5461 *result = x;
5462 return 0;
5463 }
5464 else if (PyUnicode_Check(x)) {
5465 *result = x;
5466 return 0;
5467 }
5468 else {
5469 /* wrong return value */
5470 PyErr_SetString(PyExc_TypeError,
5471 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005472 Py_DECREF(x);
5473 return -1;
5474 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475}
5476/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 if not reallocate and adjust various state variables.
5478 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479static
Walter Dörwald4894c302003-10-24 14:25:28 +00005480int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005483 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005484 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 /* remember old output position */
5486 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5487 /* exponentially overallocate to minimize reallocations */
5488 if (requiredsize < 2 * oldsize)
5489 requiredsize = 2 * oldsize;
5490 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5491 return -1;
5492 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 }
5494 return 0;
5495}
5496/* lookup the character, put the result in the output string and adjust
5497 various state variables. Return a new reference to the object that
5498 was put in the output buffer in *result, or Py_None, if the mapping was
5499 undefined (in which case no character was written).
5500 The called must decref result.
5501 Return 0 on success, -1 on error. */
5502static
Walter Dörwald4894c302003-10-24 14:25:28 +00005503int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5505 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506{
Walter Dörwald4894c302003-10-24 14:25:28 +00005507 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 /* not found => default to 1:1 mapping */
5511 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 }
5513 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005515 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 /* no overflow check, because we know that the space is enough */
5517 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 }
5519 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5521 if (repsize==1) {
5522 /* no overflow check, because we know that the space is enough */
5523 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5524 }
5525 else if (repsize!=0) {
5526 /* more than one character */
5527 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5528 (insize - (curinp-startinp)) +
5529 repsize - 1;
5530 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5531 return -1;
5532 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5533 *outp += repsize;
5534 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005535 }
5536 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 return 0;
5539}
5540
5541PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 Py_ssize_t size,
5543 PyObject *mapping,
5544 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005546 /* output object */
5547 PyObject *res = NULL;
5548 /* pointers to the beginning and end+1 of input */
5549 const Py_UNICODE *startp = p;
5550 const Py_UNICODE *endp = p + size;
5551 /* pointer into the output */
5552 Py_UNICODE *str;
5553 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005554 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555 char *reason = "character maps to <undefined>";
5556 PyObject *errorHandler = NULL;
5557 PyObject *exc = NULL;
5558 /* the following variable is used for caching string comparisons
5559 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5560 * 3=ignore, 4=xmlcharrefreplace */
5561 int known_errorHandler = -1;
5562
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 PyErr_BadArgument();
5565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567
5568 /* allocate enough for a simple 1:1 translation without
5569 replacements, if we need more, we'll resize */
5570 res = PyUnicode_FromUnicode(NULL, size);
5571 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005575 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 /* try to encode it */
5579 PyObject *x = NULL;
5580 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5581 Py_XDECREF(x);
5582 goto onError;
5583 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005584 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 if (x!=Py_None) /* it worked => adjust input pointer */
5586 ++p;
5587 else { /* untranslatable character */
5588 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5589 Py_ssize_t repsize;
5590 Py_ssize_t newpos;
5591 Py_UNICODE *uni2;
5592 /* startpos for collecting untranslatable chars */
5593 const Py_UNICODE *collstart = p;
5594 const Py_UNICODE *collend = p+1;
5595 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 /* find all untranslatable characters */
5598 while (collend < endp) {
5599 if (charmaptranslate_lookup(*collend, mapping, &x))
5600 goto onError;
5601 Py_XDECREF(x);
5602 if (x!=Py_None)
5603 break;
5604 ++collend;
5605 }
5606 /* cache callback name lookup
5607 * (if not done yet, i.e. it's the first error) */
5608 if (known_errorHandler==-1) {
5609 if ((errors==NULL) || (!strcmp(errors, "strict")))
5610 known_errorHandler = 1;
5611 else if (!strcmp(errors, "replace"))
5612 known_errorHandler = 2;
5613 else if (!strcmp(errors, "ignore"))
5614 known_errorHandler = 3;
5615 else if (!strcmp(errors, "xmlcharrefreplace"))
5616 known_errorHandler = 4;
5617 else
5618 known_errorHandler = 0;
5619 }
5620 switch (known_errorHandler) {
5621 case 1: /* strict */
5622 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005623 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 case 2: /* replace */
5625 /* No need to check for space, this is a 1:1 replacement */
5626 for (coll = collstart; coll<collend; ++coll)
5627 *str++ = '?';
5628 /* fall through */
5629 case 3: /* ignore */
5630 p = collend;
5631 break;
5632 case 4: /* xmlcharrefreplace */
5633 /* generate replacement (temporarily (mis)uses p) */
5634 for (p = collstart; p < collend; ++p) {
5635 char buffer[2+29+1+1];
5636 char *cp;
5637 sprintf(buffer, "&#%d;", (int)*p);
5638 if (charmaptranslate_makespace(&res, &str,
5639 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5640 goto onError;
5641 for (cp = buffer; *cp; ++cp)
5642 *str++ = *cp;
5643 }
5644 p = collend;
5645 break;
5646 default:
5647 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5648 reason, startp, size, &exc,
5649 collstart-startp, collend-startp, &newpos);
5650 if (repunicode == NULL)
5651 goto onError;
5652 /* generate replacement */
5653 repsize = PyUnicode_GET_SIZE(repunicode);
5654 if (charmaptranslate_makespace(&res, &str,
5655 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5656 Py_DECREF(repunicode);
5657 goto onError;
5658 }
5659 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5660 *str++ = *uni2;
5661 p = startp + newpos;
5662 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005663 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005664 }
5665 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005666 /* Resize if we allocated to much */
5667 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005668 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 if (PyUnicode_Resize(&res, respos) < 0)
5670 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671 }
5672 Py_XDECREF(exc);
5673 Py_XDECREF(errorHandler);
5674 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 Py_XDECREF(res);
5678 Py_XDECREF(exc);
5679 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 return NULL;
5681}
5682
5683PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 PyObject *mapping,
5685 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686{
5687 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 str = PyUnicode_FromObject(str);
5690 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 PyUnicode_GET_SIZE(str),
5694 mapping,
5695 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 Py_DECREF(str);
5697 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005698
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 Py_XDECREF(str);
5701 return NULL;
5702}
Tim Petersced69f82003-09-16 20:30:58 +00005703
Guido van Rossum9e896b32000-04-05 20:11:21 +00005704/* --- Decimal Encoder ---------------------------------------------------- */
5705
5706int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 Py_ssize_t length,
5708 char *output,
5709 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005710{
5711 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 PyObject *errorHandler = NULL;
5713 PyObject *exc = NULL;
5714 const char *encoding = "decimal";
5715 const char *reason = "invalid decimal Unicode string";
5716 /* the following variable is used for caching string comparisons
5717 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5718 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005719
5720 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 PyErr_BadArgument();
5722 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005723 }
5724
5725 p = s;
5726 end = s + length;
5727 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 register Py_UNICODE ch = *p;
5729 int decimal;
5730 PyObject *repunicode;
5731 Py_ssize_t repsize;
5732 Py_ssize_t newpos;
5733 Py_UNICODE *uni2;
5734 Py_UNICODE *collstart;
5735 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005736
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005738 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 ++p;
5740 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005741 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 decimal = Py_UNICODE_TODECIMAL(ch);
5743 if (decimal >= 0) {
5744 *output++ = '0' + decimal;
5745 ++p;
5746 continue;
5747 }
5748 if (0 < ch && ch < 256) {
5749 *output++ = (char)ch;
5750 ++p;
5751 continue;
5752 }
5753 /* All other characters are considered unencodable */
5754 collstart = p;
5755 collend = p+1;
5756 while (collend < end) {
5757 if ((0 < *collend && *collend < 256) ||
5758 !Py_UNICODE_ISSPACE(*collend) ||
5759 Py_UNICODE_TODECIMAL(*collend))
5760 break;
5761 }
5762 /* cache callback name lookup
5763 * (if not done yet, i.e. it's the first error) */
5764 if (known_errorHandler==-1) {
5765 if ((errors==NULL) || (!strcmp(errors, "strict")))
5766 known_errorHandler = 1;
5767 else if (!strcmp(errors, "replace"))
5768 known_errorHandler = 2;
5769 else if (!strcmp(errors, "ignore"))
5770 known_errorHandler = 3;
5771 else if (!strcmp(errors, "xmlcharrefreplace"))
5772 known_errorHandler = 4;
5773 else
5774 known_errorHandler = 0;
5775 }
5776 switch (known_errorHandler) {
5777 case 1: /* strict */
5778 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5779 goto onError;
5780 case 2: /* replace */
5781 for (p = collstart; p < collend; ++p)
5782 *output++ = '?';
5783 /* fall through */
5784 case 3: /* ignore */
5785 p = collend;
5786 break;
5787 case 4: /* xmlcharrefreplace */
5788 /* generate replacement (temporarily (mis)uses p) */
5789 for (p = collstart; p < collend; ++p)
5790 output += sprintf(output, "&#%d;", (int)*p);
5791 p = collend;
5792 break;
5793 default:
5794 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5795 encoding, reason, s, length, &exc,
5796 collstart-s, collend-s, &newpos);
5797 if (repunicode == NULL)
5798 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005799 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005800 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005801 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5802 Py_DECREF(repunicode);
5803 goto onError;
5804 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 /* generate replacement */
5806 repsize = PyUnicode_GET_SIZE(repunicode);
5807 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5808 Py_UNICODE ch = *uni2;
5809 if (Py_UNICODE_ISSPACE(ch))
5810 *output++ = ' ';
5811 else {
5812 decimal = Py_UNICODE_TODECIMAL(ch);
5813 if (decimal >= 0)
5814 *output++ = '0' + decimal;
5815 else if (0 < ch && ch < 256)
5816 *output++ = (char)ch;
5817 else {
5818 Py_DECREF(repunicode);
5819 raise_encode_exception(&exc, encoding,
5820 s, length, collstart-s, collend-s, reason);
5821 goto onError;
5822 }
5823 }
5824 }
5825 p = s + newpos;
5826 Py_DECREF(repunicode);
5827 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005828 }
5829 /* 0-terminate the output string */
5830 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 Py_XDECREF(exc);
5832 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005833 return 0;
5834
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005836 Py_XDECREF(exc);
5837 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005838 return -1;
5839}
5840
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841/* --- Helpers ------------------------------------------------------------ */
5842
Eric Smith8c663262007-08-25 02:26:07 +00005843#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005844#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005845#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005846/* Include _ParseTupleFinds from find.h */
5847#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005848#include "stringlib/find.h"
5849#include "stringlib/partition.h"
5850
Eric Smith5807c412008-05-11 21:00:57 +00005851#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005852#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005853#include "stringlib/localeutil.h"
5854
/* Helper macro that normalizes slice bounds against (obj)->length.
   It expands to plain statements that modify the variables named
   `start` and `end` in the enclosing scope:
     - a negative start has the length added, then is floored at 0;
     - an end beyond the length is cut back to the length;
     - a negative end has the length added, then is floored at 0. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
    }                                           \
    if (start < 0) {                            \
        start = 0;                              \
    }                                           \
    if (end > (obj)->length) {                  \
        end = (obj)->length;                    \
    }                                           \
    if (end < 0) {                              \
        end += (obj)->length;                   \
    }                                           \
    if (end < 0) {                              \
        end = 0;                                \
    }
5867
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005869 PyObject *substr,
5870 Py_ssize_t start,
5871 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005873 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005874 PyUnicodeObject* str_obj;
5875 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005876
Thomas Wouters477c8d52006-05-27 19:21:47 +00005877 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5878 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005880 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5881 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 Py_DECREF(str_obj);
5883 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 }
Tim Petersced69f82003-09-16 20:30:58 +00005885
Thomas Wouters477c8d52006-05-27 19:21:47 +00005886 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005887
Thomas Wouters477c8d52006-05-27 19:21:47 +00005888 result = stringlib_count(
5889 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5890 );
5891
5892 Py_DECREF(sub_obj);
5893 Py_DECREF(str_obj);
5894
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 return result;
5896}
5897
Martin v. Löwis18e16552006-02-15 17:27:45 +00005898Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005899 PyObject *sub,
5900 Py_ssize_t start,
5901 Py_ssize_t end,
5902 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005904 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005905
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005907 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005909 sub = PyUnicode_FromObject(sub);
5910 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 Py_DECREF(str);
5912 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 }
Tim Petersced69f82003-09-16 20:30:58 +00005914
Thomas Wouters477c8d52006-05-27 19:21:47 +00005915 if (direction > 0)
5916 result = stringlib_find_slice(
5917 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5918 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5919 start, end
5920 );
5921 else
5922 result = stringlib_rfind_slice(
5923 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5924 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5925 start, end
5926 );
5927
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005929 Py_DECREF(sub);
5930
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 return result;
5932}
5933
Tim Petersced69f82003-09-16 20:30:58 +00005934static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 PyUnicodeObject *substring,
5937 Py_ssize_t start,
5938 Py_ssize_t end,
5939 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 if (substring->length == 0)
5942 return 1;
5943
Thomas Wouters477c8d52006-05-27 19:21:47 +00005944 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945
5946 end -= substring->length;
5947 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949
5950 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 if (Py_UNICODE_MATCH(self, end, substring))
5952 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 } else {
5954 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 }
5957
5958 return 0;
5959}
5960
Martin v. Löwis18e16552006-02-15 17:27:45 +00005961Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 PyObject *substr,
5963 Py_ssize_t start,
5964 Py_ssize_t end,
5965 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005967 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005968
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 str = PyUnicode_FromObject(str);
5970 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 substr = PyUnicode_FromObject(substr);
5973 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 Py_DECREF(str);
5975 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 }
Tim Petersced69f82003-09-16 20:30:58 +00005977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 (PyUnicodeObject *)substr,
5980 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 Py_DECREF(str);
5982 Py_DECREF(substr);
5983 return result;
5984}
5985
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986/* Apply fixfct filter to the Unicode object self and return a
5987 reference to the modified object */
5988
Tim Petersced69f82003-09-16 20:30:58 +00005989static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992{
5993
5994 PyUnicodeObject *u;
5995
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005996 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005999
6000 Py_UNICODE_COPY(u->str, self->str, self->length);
6001
Tim Peters7a29bd52001-09-12 03:03:31 +00006002 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* fixfct should return TRUE if it modified the buffer. If
6004 FALSE, return a reference to the original buffer instead
6005 (to save space, not time) */
6006 Py_INCREF(self);
6007 Py_DECREF(u);
6008 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 }
6010 return (PyObject*) u;
6011}
6012
Tim Petersced69f82003-09-16 20:30:58 +00006013static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014int fixupper(PyUnicodeObject *self)
6015{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006016 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 Py_UNICODE *s = self->str;
6018 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006019
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006022
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 ch = Py_UNICODE_TOUPPER(*s);
6024 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 *s = ch;
6027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 s++;
6029 }
6030
6031 return status;
6032}
6033
Tim Petersced69f82003-09-16 20:30:58 +00006034static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035int fixlower(PyUnicodeObject *self)
6036{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006037 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 Py_UNICODE *s = self->str;
6039 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006040
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006043
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 ch = Py_UNICODE_TOLOWER(*s);
6045 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 *s = ch;
6048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 s++;
6050 }
6051
6052 return status;
6053}
6054
Tim Petersced69f82003-09-16 20:30:58 +00006055static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056int fixswapcase(PyUnicodeObject *self)
6057{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006058 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 Py_UNICODE *s = self->str;
6060 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 while (len-- > 0) {
6063 if (Py_UNICODE_ISUPPER(*s)) {
6064 *s = Py_UNICODE_TOLOWER(*s);
6065 status = 1;
6066 } else if (Py_UNICODE_ISLOWER(*s)) {
6067 *s = Py_UNICODE_TOUPPER(*s);
6068 status = 1;
6069 }
6070 s++;
6071 }
6072
6073 return status;
6074}
6075
Tim Petersced69f82003-09-16 20:30:58 +00006076static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077int fixcapitalize(PyUnicodeObject *self)
6078{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006079 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006080 Py_UNICODE *s = self->str;
6081 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006082
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006083 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006085 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 *s = Py_UNICODE_TOUPPER(*s);
6087 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006089 s++;
6090 while (--len > 0) {
6091 if (Py_UNICODE_ISUPPER(*s)) {
6092 *s = Py_UNICODE_TOLOWER(*s);
6093 status = 1;
6094 }
6095 s++;
6096 }
6097 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098}
6099
6100static
6101int fixtitle(PyUnicodeObject *self)
6102{
6103 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6104 register Py_UNICODE *e;
6105 int previous_is_cased;
6106
6107 /* Shortcut for single character strings */
6108 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6110 if (*p != ch) {
6111 *p = ch;
6112 return 1;
6113 }
6114 else
6115 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 }
Tim Petersced69f82003-09-16 20:30:58 +00006117
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 e = p + PyUnicode_GET_SIZE(self);
6119 previous_is_cased = 0;
6120 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006122
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 if (previous_is_cased)
6124 *p = Py_UNICODE_TOLOWER(ch);
6125 else
6126 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006127
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 if (Py_UNICODE_ISLOWER(ch) ||
6129 Py_UNICODE_ISUPPER(ch) ||
6130 Py_UNICODE_ISTITLE(ch))
6131 previous_is_cased = 1;
6132 else
6133 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 }
6135 return 1;
6136}
6137
Tim Peters8ce9f162004-08-27 01:49:32 +00006138PyObject *
6139PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140{
Skip Montanaro6543b452004-09-16 03:28:13 +00006141 const Py_UNICODE blank = ' ';
6142 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006143 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006144 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006145 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6146 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006147 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6148 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006149 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006150 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151
Tim Peters05eba1f2004-08-27 21:32:02 +00006152 fseq = PySequence_Fast(seq, "");
6153 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006154 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006155 }
6156
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006157 /* NOTE: the following code can't call back into Python code,
6158 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006159 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006160
Tim Peters05eba1f2004-08-27 21:32:02 +00006161 seqlen = PySequence_Fast_GET_SIZE(fseq);
6162 /* If empty sequence, return u"". */
6163 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006164 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6165 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006166 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006167 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006168 /* If singleton sequence with an exact Unicode, return that. */
6169 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 item = items[0];
6171 if (PyUnicode_CheckExact(item)) {
6172 Py_INCREF(item);
6173 res = (PyUnicodeObject *)item;
6174 goto Done;
6175 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006176 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006177 else {
6178 /* Set up sep and seplen */
6179 if (separator == NULL) {
6180 sep = &blank;
6181 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006182 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006183 else {
6184 if (!PyUnicode_Check(separator)) {
6185 PyErr_Format(PyExc_TypeError,
6186 "separator: expected str instance,"
6187 " %.80s found",
6188 Py_TYPE(separator)->tp_name);
6189 goto onError;
6190 }
6191 sep = PyUnicode_AS_UNICODE(separator);
6192 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006193 }
6194 }
6195
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006196 /* There are at least two things to join, or else we have a subclass
6197 * of str in the sequence.
6198 * Do a pre-pass to figure out the total amount of space we'll
6199 * need (sz), and see whether all argument are strings.
6200 */
6201 sz = 0;
6202 for (i = 0; i < seqlen; i++) {
6203 const Py_ssize_t old_sz = sz;
6204 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 if (!PyUnicode_Check(item)) {
6206 PyErr_Format(PyExc_TypeError,
6207 "sequence item %zd: expected str instance,"
6208 " %.80s found",
6209 i, Py_TYPE(item)->tp_name);
6210 goto onError;
6211 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006212 sz += PyUnicode_GET_SIZE(item);
6213 if (i != 0)
6214 sz += seplen;
6215 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6216 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006218 goto onError;
6219 }
6220 }
Tim Petersced69f82003-09-16 20:30:58 +00006221
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006222 res = _PyUnicode_New(sz);
6223 if (res == NULL)
6224 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006225
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006226 /* Catenate everything. */
6227 res_p = PyUnicode_AS_UNICODE(res);
6228 for (i = 0; i < seqlen; ++i) {
6229 Py_ssize_t itemlen;
6230 item = items[i];
6231 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 /* Copy item, and maybe the separator. */
6233 if (i) {
6234 Py_UNICODE_COPY(res_p, sep, seplen);
6235 res_p += seplen;
6236 }
6237 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6238 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006239 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006240
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006242 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 return (PyObject *)res;
6244
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006246 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006247 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 return NULL;
6249}
6250
Tim Petersced69f82003-09-16 20:30:58 +00006251static
6252PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 Py_ssize_t left,
6254 Py_ssize_t right,
6255 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256{
6257 PyUnicodeObject *u;
6258
6259 if (left < 0)
6260 left = 0;
6261 if (right < 0)
6262 right = 0;
6263
Tim Peters7a29bd52001-09-12 03:03:31 +00006264 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 Py_INCREF(self);
6266 return self;
6267 }
6268
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006269 if (left > PY_SSIZE_T_MAX - self->length ||
6270 right > PY_SSIZE_T_MAX - (left + self->length)) {
6271 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6272 return NULL;
6273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 u = _PyUnicode_New(left + self->length + right);
6275 if (u) {
6276 if (left)
6277 Py_UNICODE_FILL(u->str, fill, left);
6278 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6279 if (right)
6280 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6281 }
6282
6283 return u;
6284}
6285
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the expansion site providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label; jumps to onError
   if the slice cannot be created or appended.  Wrapped in
   do { } while (0) so that it expands to exactly one statement and is
   safe inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296
6297static
6298PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 PyObject *list,
6300 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006302 register Py_ssize_t i;
6303 register Py_ssize_t j;
6304 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006306 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307
6308 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006310 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 i++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006312 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6314 i++;
6315 if (j < i) {
6316 if (maxcount-- <= 0)
6317 break;
6318 SPLIT_APPEND(buf, j, i);
6319 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6320 i++;
6321 j = i;
6322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 }
6324 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326 }
6327 return list;
6328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 Py_DECREF(list);
6331 return NULL;
6332}
6333
6334PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006337 register Py_ssize_t i;
6338 register Py_ssize_t j;
6339 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 PyObject *list;
6341 PyObject *str;
6342 Py_UNICODE *data;
6343
6344 string = PyUnicode_FromObject(string);
6345 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 data = PyUnicode_AS_UNICODE(string);
6348 len = PyUnicode_GET_SIZE(string);
6349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 list = PyList_New(0);
6351 if (!list)
6352 goto onError;
6353
6354 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006356
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 /* Find a line and append it */
6358 while (i < len && !BLOOM_LINEBREAK(data[i]))
6359 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006362 eol = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 if (i < len) {
6364 if (data[i] == '\r' && i + 1 < len &&
6365 data[i+1] == '\n')
6366 i += 2;
6367 else
6368 i++;
6369 if (keepends)
6370 eol = i;
6371 }
6372 SPLIT_APPEND(data, j, eol);
6373 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 }
6375 if (j < len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 }
6378
6379 Py_DECREF(string);
6380 return list;
6381
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006383 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 Py_DECREF(string);
6385 return NULL;
6386}
6387
Tim Petersced69f82003-09-16 20:30:58 +00006388static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389PyObject *split_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 PyObject *list,
6391 Py_UNICODE ch,
6392 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006394 register Py_ssize_t i;
6395 register Py_ssize_t j;
6396 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006398 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399
6400 for (i = j = 0; i < len; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 if (buf[i] == ch) {
6402 if (maxcount-- <= 0)
6403 break;
6404 SPLIT_APPEND(buf, j, i);
6405 i = j = i + 1;
6406 } else
6407 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 }
6409 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 }
6412 return list;
6413
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 Py_DECREF(list);
6416 return NULL;
6417}
6418
Tim Petersced69f82003-09-16 20:30:58 +00006419static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420PyObject *split_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 PyObject *list,
6422 PyUnicodeObject *substring,
6423 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006425 register Py_ssize_t i;
6426 register Py_ssize_t j;
6427 Py_ssize_t len = self->length;
6428 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 PyObject *str;
6430
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006431 for (i = j = 0; i <= len - sublen; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 if (Py_UNICODE_MATCH(self, i, substring)) {
6433 if (maxcount-- <= 0)
6434 break;
6435 SPLIT_APPEND(self->str, j, i);
6436 i = j = i + sublen;
6437 } else
6438 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 }
6440 if (j <= len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 }
6443 return list;
6444
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 Py_DECREF(list);
6447 return NULL;
6448}
6449
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006450static
6451PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 PyObject *list,
6453 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006455 register Py_ssize_t i;
6456 register Py_ssize_t j;
6457 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006458 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006459 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006460
6461 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 /* find a token */
Benjamin Peterson14339b62009-01-31 16:36:08 +00006463 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 i--;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006465 j = i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6467 i--;
6468 if (j > i) {
6469 if (maxcount-- <= 0)
6470 break;
6471 SPLIT_APPEND(buf, i + 1, j + 1);
6472 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6473 i--;
6474 j = i;
6475 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006476 }
6477 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006479 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006480 if (PyList_Reverse(list) < 0)
6481 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006482 return list;
6483
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006485 Py_DECREF(list);
6486 return NULL;
6487}
6488
Benjamin Peterson14339b62009-01-31 16:36:08 +00006489static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006490PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 PyObject *list,
6492 Py_UNICODE ch,
6493 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006494{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495 register Py_ssize_t i;
6496 register Py_ssize_t j;
6497 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006498 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006499 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006500
6501 for (i = j = len - 1; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 if (buf[i] == ch) {
6503 if (maxcount-- <= 0)
6504 break;
6505 SPLIT_APPEND(buf, i + 1, j + 1);
6506 j = i = i - 1;
6507 } else
6508 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006509 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006510 if (j >= -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006512 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006513 if (PyList_Reverse(list) < 0)
6514 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006515 return list;
6516
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006518 Py_DECREF(list);
6519 return NULL;
6520}
6521
Benjamin Peterson14339b62009-01-31 16:36:08 +00006522static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006523PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006524 PyObject *list,
6525 PyUnicodeObject *substring,
6526 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006528 register Py_ssize_t i;
6529 register Py_ssize_t j;
6530 Py_ssize_t len = self->length;
6531 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006532 PyObject *str;
6533
6534 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 if (Py_UNICODE_MATCH(self, i, substring)) {
6536 if (maxcount-- <= 0)
6537 break;
6538 SPLIT_APPEND(self->str, i + sublen, j);
6539 j = i;
6540 i -= sublen;
6541 } else
6542 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006543 }
6544 if (j >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006546 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006547 if (PyList_Reverse(list) < 0)
6548 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006549 return list;
6550
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006552 Py_DECREF(list);
6553 return NULL;
6554}
6555
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556#undef SPLIT_APPEND
6557
6558static
6559PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 PyUnicodeObject *substring,
6561 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562{
6563 PyObject *list;
6564
6565 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006566 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
6568 list = PyList_New(0);
6569 if (!list)
6570 return NULL;
6571
6572 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
6575 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577
6578 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 Py_DECREF(list);
6580 PyErr_SetString(PyExc_ValueError, "empty separator");
6581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 }
6583 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585}
6586
Tim Petersced69f82003-09-16 20:30:58 +00006587static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006588PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 PyUnicodeObject *substring,
6590 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006591{
6592 PyObject *list;
6593
6594 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006595 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006596
6597 list = PyList_New(0);
6598 if (!list)
6599 return NULL;
6600
6601 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006603
6604 else if (substring->length == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006606
6607 else if (substring->length == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 Py_DECREF(list);
6609 PyErr_SetString(PyExc_ValueError, "empty separator");
6610 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006611 }
6612 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006614}
6615
6616static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 PyUnicodeObject *str1,
6619 PyUnicodeObject *str2,
6620 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621{
6622 PyUnicodeObject *u;
6623
6624 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626
Thomas Wouters477c8d52006-05-27 19:21:47 +00006627 if (str1->length == str2->length) {
6628 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006629 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006630 if (str1->length == 1) {
6631 /* replace characters */
6632 Py_UNICODE u1, u2;
6633 if (!findchar(self->str, self->length, str1->str[0]))
6634 goto nothing;
6635 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6636 if (!u)
6637 return NULL;
6638 Py_UNICODE_COPY(u->str, self->str, self->length);
6639 u1 = str1->str[0];
6640 u2 = str2->str[0];
6641 for (i = 0; i < u->length; i++)
6642 if (u->str[i] == u1) {
6643 if (--maxcount < 0)
6644 break;
6645 u->str[i] = u2;
6646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006648 i = fastsearch(
6649 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006651 if (i < 0)
6652 goto nothing;
6653 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6654 if (!u)
6655 return NULL;
6656 Py_UNICODE_COPY(u->str, self->str, self->length);
6657 while (i <= self->length - str1->length)
6658 if (Py_UNICODE_MATCH(self, i, str1)) {
6659 if (--maxcount < 0)
6660 break;
6661 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6662 i += str1->length;
6663 } else
6664 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006667
6668 Py_ssize_t n, i, j, e;
6669 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 Py_UNICODE *p;
6671
6672 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006673 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 if (n > maxcount)
6675 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006676 if (n == 0)
6677 goto nothing;
6678 /* new_size = self->length + n * (str2->length - str1->length)); */
6679 delta = (str2->length - str1->length);
6680 if (delta == 0) {
6681 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006683 product = n * (str2->length - str1->length);
6684 if ((product / (str2->length - str1->length)) != n) {
6685 PyErr_SetString(PyExc_OverflowError,
6686 "replace string is too long");
6687 return NULL;
6688 }
6689 new_size = self->length + product;
6690 if (new_size < 0) {
6691 PyErr_SetString(PyExc_OverflowError,
6692 "replace string is too long");
6693 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 }
6695 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006696 u = _PyUnicode_New(new_size);
6697 if (!u)
6698 return NULL;
6699 i = 0;
6700 p = u->str;
6701 e = self->length - str1->length;
6702 if (str1->length > 0) {
6703 while (n-- > 0) {
6704 /* look for next match */
6705 j = i;
6706 while (j <= e) {
6707 if (Py_UNICODE_MATCH(self, j, str1))
6708 break;
6709 j++;
6710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006712 if (j > e)
6713 break;
6714 /* copy unchanged part [i:j] */
6715 Py_UNICODE_COPY(p, self->str+i, j-i);
6716 p += j - i;
6717 }
6718 /* copy substitution string */
6719 if (str2->length > 0) {
6720 Py_UNICODE_COPY(p, str2->str, str2->length);
6721 p += str2->length;
6722 }
6723 i = j + str1->length;
6724 }
6725 if (i < self->length)
6726 /* copy tail [i:] */
6727 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6728 } else {
6729 /* interleave */
6730 while (n > 0) {
6731 Py_UNICODE_COPY(p, str2->str, str2->length);
6732 p += str2->length;
6733 if (--n <= 0)
6734 break;
6735 *p++ = self->str[i++];
6736 }
6737 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6738 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006741
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006743 /* nothing to replace; return original string (when possible) */
6744 if (PyUnicode_CheckExact(self)) {
6745 Py_INCREF(self);
6746 return (PyObject *) self;
6747 }
6748 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749}
6750
6751/* --- Unicode Object Methods --------------------------------------------- */
6752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006753PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755\n\
6756Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006757characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758
6759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006760unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 return fixup(self, fixtitle);
6763}
6764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006765PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767\n\
6768Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006769have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770
6771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006772unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 return fixup(self, fixcapitalize);
6775}
6776
6777#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006778PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780\n\
6781Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006782normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783
6784static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006785unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786{
6787 PyObject *list;
6788 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006789 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 /* Split into words */
6792 list = split(self, NULL, -1);
6793 if (!list)
6794 return NULL;
6795
6796 /* Capitalize each word */
6797 for (i = 0; i < PyList_GET_SIZE(list); i++) {
6798 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800 if (item == NULL)
6801 goto onError;
6802 Py_DECREF(PyList_GET_ITEM(list, i));
6803 PyList_SET_ITEM(list, i, item);
6804 }
6805
6806 /* Join the words to form a new string */
6807 item = PyUnicode_Join(NULL, list);
6808
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 Py_DECREF(list);
6811 return (PyObject *)item;
6812}
6813#endif
6814
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006815/* Argument converter. Coerces to a single unicode character */
6816
6817static int
6818convert_uc(PyObject *obj, void *addr)
6819{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006820 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6821 PyObject *uniobj;
6822 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006823
Benjamin Peterson14339b62009-01-31 16:36:08 +00006824 uniobj = PyUnicode_FromObject(obj);
6825 if (uniobj == NULL) {
6826 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006828 return 0;
6829 }
6830 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6831 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006833 Py_DECREF(uniobj);
6834 return 0;
6835 }
6836 unistr = PyUnicode_AS_UNICODE(uniobj);
6837 *fillcharloc = unistr[0];
6838 Py_DECREF(uniobj);
6839 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006840}
6841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006842PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006845Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006846done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
6848static PyObject *
6849unicode_center(PyUnicodeObject *self, PyObject *args)
6850{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006851 Py_ssize_t marg, left;
6852 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006853 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
Thomas Woutersde017742006-02-16 19:34:37 +00006855 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 return NULL;
6857
Tim Peters7a29bd52001-09-12 03:03:31 +00006858 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 Py_INCREF(self);
6860 return (PyObject*) self;
6861 }
6862
6863 marg = width - self->length;
6864 left = marg / 2 + (marg & width & 1);
6865
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006866 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867}
6868
Marc-André Lemburge5034372000-08-08 08:04:29 +00006869#if 0
6870
6871/* This code should go into some future Unicode collation support
6872 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006873 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006874
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006875/* speedy UTF-16 code point order comparison */
6876/* gleaned from: */
6877/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6878
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006879static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006880{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006881 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006882 0, 0, 0, 0, 0, 0, 0, 0,
6883 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006884 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006885};
6886
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887static int
6888unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6889{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006890 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006891
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 Py_UNICODE *s1 = str1->str;
6893 Py_UNICODE *s2 = str2->str;
6894
6895 len1 = str1->length;
6896 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006897
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006899 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006900
6901 c1 = *s1++;
6902 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006903
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 if (c1 > (1<<11) * 26)
6905 c1 += utf16Fixup[c1>>11];
6906 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006907 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006908 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006909
6910 if (c1 != c2)
6911 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006912
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006913 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 }
6915
6916 return (len1 < len2) ? -1 : (len1 != len2);
6917}
6918
Marc-André Lemburge5034372000-08-08 08:04:29 +00006919#else
6920
6921static int
6922unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6923{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006924 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006925
6926 Py_UNICODE *s1 = str1->str;
6927 Py_UNICODE *s2 = str2->str;
6928
6929 len1 = str1->length;
6930 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006931
Marc-André Lemburge5034372000-08-08 08:04:29 +00006932 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006933 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006934
Fredrik Lundh45714e92001-06-26 16:39:36 +00006935 c1 = *s1++;
6936 c2 = *s2++;
6937
6938 if (c1 != c2)
6939 return (c1 < c2) ? -1 : 1;
6940
Marc-André Lemburge5034372000-08-08 08:04:29 +00006941 len1--; len2--;
6942 }
6943
6944 return (len1 < len2) ? -1 : (len1 != len2);
6945}
6946
6947#endif
6948
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006952 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6953 return unicode_compare((PyUnicodeObject *)left,
6954 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006955 PyErr_Format(PyExc_TypeError,
6956 "Can't compare %.100s and %.100s",
6957 left->ob_type->tp_name,
6958 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 return -1;
6960}
6961
Martin v. Löwis5b222132007-06-10 09:51:05 +00006962int
6963PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6964{
6965 int i;
6966 Py_UNICODE *id;
6967 assert(PyUnicode_Check(uni));
6968 id = PyUnicode_AS_UNICODE(uni);
6969 /* Compare Unicode string and source character set string */
6970 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 if (id[i] != str[i])
6972 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Petersonbb81c8c2010-01-09 21:54:39 +00006973 /* This check keeps Python strings that end in '\0' from comparing equal
6974 to C strings identical up to that point. */
6975 if (PyUnicode_GET_SIZE(uni) != i)
6976 /* We'll say the Python string is longer. */
6977 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006978 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006980 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006982 return 0;
6983}
6984
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006985
Benjamin Peterson29060642009-01-31 22:14:21 +00006986#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006987 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006988
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006989PyObject *PyUnicode_RichCompare(PyObject *left,
6990 PyObject *right,
6991 int op)
6992{
6993 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006994
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006995 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6996 PyObject *v;
6997 if (((PyUnicodeObject *) left)->length !=
6998 ((PyUnicodeObject *) right)->length) {
6999 if (op == Py_EQ) {
7000 Py_INCREF(Py_False);
7001 return Py_False;
7002 }
7003 if (op == Py_NE) {
7004 Py_INCREF(Py_True);
7005 return Py_True;
7006 }
7007 }
7008 if (left == right)
7009 result = 0;
7010 else
7011 result = unicode_compare((PyUnicodeObject *)left,
7012 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007013
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007014 /* Convert the return value to a Boolean */
7015 switch (op) {
7016 case Py_EQ:
7017 v = TEST_COND(result == 0);
7018 break;
7019 case Py_NE:
7020 v = TEST_COND(result != 0);
7021 break;
7022 case Py_LE:
7023 v = TEST_COND(result <= 0);
7024 break;
7025 case Py_GE:
7026 v = TEST_COND(result >= 0);
7027 break;
7028 case Py_LT:
7029 v = TEST_COND(result == -1);
7030 break;
7031 case Py_GT:
7032 v = TEST_COND(result == 1);
7033 break;
7034 default:
7035 PyErr_BadArgument();
7036 return NULL;
7037 }
7038 Py_INCREF(v);
7039 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007040 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007041
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007042 Py_INCREF(Py_NotImplemented);
7043 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007044}
7045
Guido van Rossum403d68b2000-03-13 15:55:09 +00007046int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007048{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007049 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007050 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007051
7052 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007053 sub = PyUnicode_FromObject(element);
7054 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 PyErr_Format(PyExc_TypeError,
7056 "'in <string>' requires string as left operand, not %s",
7057 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007058 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007059 }
7060
Thomas Wouters477c8d52006-05-27 19:21:47 +00007061 str = PyUnicode_FromObject(container);
7062 if (!str) {
7063 Py_DECREF(sub);
7064 return -1;
7065 }
7066
7067 result = stringlib_contains_obj(str, sub);
7068
7069 Py_DECREF(str);
7070 Py_DECREF(sub);
7071
Guido van Rossum403d68b2000-03-13 15:55:09 +00007072 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007073}
7074
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075/* Concat to string or Unicode object giving a new Unicode object. */
7076
7077PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079{
7080 PyUnicodeObject *u = NULL, *v = NULL, *w;
7081
7082 /* Coerce the two arguments */
7083 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7084 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7087 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089
7090 /* Shortcuts */
7091 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 Py_DECREF(v);
7093 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 }
7095 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 Py_DECREF(u);
7097 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 }
7099
7100 /* Concat the two Unicode strings */
7101 w = _PyUnicode_New(u->length + v->length);
7102 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 Py_UNICODE_COPY(w->str, u->str, u->length);
7105 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7106
7107 Py_DECREF(u);
7108 Py_DECREF(v);
7109 return (PyObject *)w;
7110
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 Py_XDECREF(u);
7113 Py_XDECREF(v);
7114 return NULL;
7115}
7116
Walter Dörwald1ab83302007-05-18 17:15:44 +00007117void
7118PyUnicode_Append(PyObject **pleft, PyObject *right)
7119{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007120 PyObject *new;
7121 if (*pleft == NULL)
7122 return;
7123 if (right == NULL || !PyUnicode_Check(*pleft)) {
7124 Py_DECREF(*pleft);
7125 *pleft = NULL;
7126 return;
7127 }
7128 new = PyUnicode_Concat(*pleft, right);
7129 Py_DECREF(*pleft);
7130 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007131}
7132
7133void
7134PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7135{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007136 PyUnicode_Append(pleft, right);
7137 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007138}
7139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007140PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007143Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007144string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007145interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
7147static PyObject *
7148unicode_count(PyUnicodeObject *self, PyObject *args)
7149{
7150 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007151 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007152 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 PyObject *result;
7154
Guido van Rossumb8872e62000-05-09 14:14:27 +00007155 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 return NULL;
7158
7159 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007160 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007163
Thomas Wouters477c8d52006-05-27 19:21:47 +00007164 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165
Christian Heimes217cfd12007-12-02 14:31:20 +00007166 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007167 stringlib_count(self->str + start, end - start,
7168 substring->str, substring->length)
7169 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170
7171 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007172
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 return result;
7174}
7175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007176PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007179Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007180to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007181handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007182a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7183'xmlcharrefreplace' as well as any other name registered with\n\
7184codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185
7186static PyObject *
7187unicode_encode(PyUnicodeObject *self, PyObject *args)
7188{
7189 char *encoding = NULL;
7190 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007191 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007192
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
7194 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007195 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007196 if (v == NULL)
7197 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007198 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007199 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007200 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007201 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007202 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007203 Py_DECREF(v);
7204 return NULL;
7205 }
7206 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007207
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007209 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007210}
7211
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007212PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214\n\
7215Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007216If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217
7218static PyObject*
7219unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7220{
7221 Py_UNICODE *e;
7222 Py_UNICODE *p;
7223 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007224 Py_UNICODE *qe;
7225 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226 PyUnicodeObject *u;
7227 int tabsize = 8;
7228
7229 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231
Thomas Wouters7e474022000-07-16 12:04:32 +00007232 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007233 i = 0; /* chars up to and including most recent \n or \r */
7234 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7235 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236 for (p = self->str; p < e; p++)
7237 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 if (tabsize > 0) {
7239 incr = tabsize - (j % tabsize); /* cannot overflow */
7240 if (j > PY_SSIZE_T_MAX - incr)
7241 goto overflow1;
7242 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007243 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 if (j > PY_SSIZE_T_MAX - 1)
7247 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 j++;
7249 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 if (i > PY_SSIZE_T_MAX - j)
7251 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007253 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 }
7255 }
7256
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007257 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 /* Second pass: create output string and fill it */
7261 u = _PyUnicode_New(i + j);
7262 if (!u)
7263 return NULL;
7264
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007265 j = 0; /* same as in first pass */
7266 q = u->str; /* next output char */
7267 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268
7269 for (p = self->str; p < e; p++)
7270 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 if (tabsize > 0) {
7272 i = tabsize - (j % tabsize);
7273 j += i;
7274 while (i--) {
7275 if (q >= qe)
7276 goto overflow2;
7277 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007278 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007280 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 else {
7282 if (q >= qe)
7283 goto overflow2;
7284 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007285 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286 if (*p == '\n' || *p == '\r')
7287 j = 0;
7288 }
7289
7290 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007291
7292 overflow2:
7293 Py_DECREF(u);
7294 overflow1:
7295 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297}
7298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007299PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301\n\
7302Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007303such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304arguments start and end are interpreted as in slice notation.\n\
7305\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007306Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307
7308static PyObject *
7309unicode_find(PyUnicodeObject *self, PyObject *args)
7310{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007311 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007312 Py_ssize_t start;
7313 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007314 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315
Christian Heimes9cd17752007-11-18 19:35:23 +00007316 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318
Thomas Wouters477c8d52006-05-27 19:21:47 +00007319 result = stringlib_find_slice(
7320 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7321 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7322 start, end
7323 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324
7325 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007326
Christian Heimes217cfd12007-12-02 14:31:20 +00007327 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328}
7329
7330static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007331unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332{
7333 if (index < 0 || index >= self->length) {
7334 PyErr_SetString(PyExc_IndexError, "string index out of range");
7335 return NULL;
7336 }
7337
7338 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7339}
7340
Guido van Rossumc2504932007-09-18 19:42:40 +00007341/* Believe it or not, this produces the same value for ASCII strings
7342 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007344unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345{
Guido van Rossumc2504932007-09-18 19:42:40 +00007346 Py_ssize_t len;
7347 Py_UNICODE *p;
7348 long x;
7349
7350 if (self->hash != -1)
7351 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007352 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007353 p = self->str;
7354 x = *p << 7;
7355 while (--len >= 0)
7356 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007357 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007358 if (x == -1)
7359 x = -2;
7360 self->hash = x;
7361 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362}
7363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007364PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007367Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368
7369static PyObject *
7370unicode_index(PyUnicodeObject *self, PyObject *args)
7371{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007372 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007373 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007374 Py_ssize_t start;
7375 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376
Christian Heimes9cd17752007-11-18 19:35:23 +00007377 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
Thomas Wouters477c8d52006-05-27 19:21:47 +00007380 result = stringlib_find_slice(
7381 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7382 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7383 start, end
7384 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385
7386 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007387
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 if (result < 0) {
7389 PyErr_SetString(PyExc_ValueError, "substring not found");
7390 return NULL;
7391 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007392
Christian Heimes217cfd12007-12-02 14:31:20 +00007393 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394}
7395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007396PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007399Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007400at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401
7402static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007403unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404{
7405 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7406 register const Py_UNICODE *e;
7407 int cased;
7408
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409 /* Shortcut for single character strings */
7410 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007413 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007414 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 e = p + PyUnicode_GET_SIZE(self);
7418 cased = 0;
7419 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007421
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7423 return PyBool_FromLong(0);
7424 else if (!cased && Py_UNICODE_ISLOWER(ch))
7425 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007427 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428}
7429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007430PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007433Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007434at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435
7436static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007437unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438{
7439 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7440 register const Py_UNICODE *e;
7441 int cased;
7442
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 /* Shortcut for single character strings */
7444 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007447 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007448 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007450
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 e = p + PyUnicode_GET_SIZE(self);
7452 cased = 0;
7453 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007455
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7457 return PyBool_FromLong(0);
7458 else if (!cased && Py_UNICODE_ISUPPER(ch))
7459 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007461 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462}
7463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007464PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007467Return True if S is a titlecased string and there is at least one\n\
7468character in S, i.e. upper- and titlecase characters may only\n\
7469follow uncased characters and lowercase characters only cased ones.\n\
7470Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
7472static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007473unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474{
7475 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7476 register const Py_UNICODE *e;
7477 int cased, previous_is_cased;
7478
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 /* Shortcut for single character strings */
7480 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7482 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007484 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007485 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007487
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488 e = p + PyUnicode_GET_SIZE(self);
7489 cased = 0;
7490 previous_is_cased = 0;
7491 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007493
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7495 if (previous_is_cased)
7496 return PyBool_FromLong(0);
7497 previous_is_cased = 1;
7498 cased = 1;
7499 }
7500 else if (Py_UNICODE_ISLOWER(ch)) {
7501 if (!previous_is_cased)
7502 return PyBool_FromLong(0);
7503 previous_is_cased = 1;
7504 cased = 1;
7505 }
7506 else
7507 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007509 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510}
7511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007512PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007515Return True if all characters in S are whitespace\n\
7516and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517
7518static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007519unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520{
7521 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7522 register const Py_UNICODE *e;
7523
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 /* Shortcut for single character strings */
7525 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 Py_UNICODE_ISSPACE(*p))
7527 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007529 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007530 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007532
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533 e = p + PyUnicode_GET_SIZE(self);
7534 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 if (!Py_UNICODE_ISSPACE(*p))
7536 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007538 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539}
7540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007541PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007543\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007544Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007545and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007546
7547static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007548unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007549{
7550 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7551 register const Py_UNICODE *e;
7552
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007553 /* Shortcut for single character strings */
7554 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 Py_UNICODE_ISALPHA(*p))
7556 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007557
7558 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007559 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007561
7562 e = p + PyUnicode_GET_SIZE(self);
7563 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 if (!Py_UNICODE_ISALPHA(*p))
7565 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007566 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007567 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007568}
7569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007570PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007572\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007573Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007574and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007575
7576static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007577unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007578{
7579 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7580 register const Py_UNICODE *e;
7581
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007582 /* Shortcut for single character strings */
7583 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 Py_UNICODE_ISALNUM(*p))
7585 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007586
7587 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007588 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007590
7591 e = p + PyUnicode_GET_SIZE(self);
7592 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 if (!Py_UNICODE_ISALNUM(*p))
7594 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007595 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007596 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007597}
7598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007599PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007602Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007603False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
7605static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007606unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607{
7608 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7609 register const Py_UNICODE *e;
7610
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 /* Shortcut for single character strings */
7612 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 Py_UNICODE_ISDECIMAL(*p))
7614 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007616 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007617 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007619
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 e = p + PyUnicode_GET_SIZE(self);
7621 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 if (!Py_UNICODE_ISDECIMAL(*p))
7623 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007625 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626}
7627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007628PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007631Return True if all characters in S are digits\n\
7632and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
7634static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007635unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636{
7637 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7638 register const Py_UNICODE *e;
7639
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640 /* Shortcut for single character strings */
7641 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 Py_UNICODE_ISDIGIT(*p))
7643 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007645 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007646 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007648
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649 e = p + PyUnicode_GET_SIZE(self);
7650 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 if (!Py_UNICODE_ISDIGIT(*p))
7652 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007654 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655}
7656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007657PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007660Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007661False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662
7663static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007664unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665{
7666 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7667 register const Py_UNICODE *e;
7668
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669 /* Shortcut for single character strings */
7670 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 Py_UNICODE_ISNUMERIC(*p))
7672 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007674 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007675 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007677
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678 e = p + PyUnicode_GET_SIZE(self);
7679 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 if (!Py_UNICODE_ISNUMERIC(*p))
7681 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007683 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684}
7685
Martin v. Löwis47383402007-08-15 07:32:56 +00007686int
7687PyUnicode_IsIdentifier(PyObject *self)
7688{
7689 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7690 register const Py_UNICODE *e;
7691
7692 /* Special case for empty strings */
7693 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007695
7696 /* PEP 3131 says that the first character must be in
7697 XID_Start and subsequent characters in XID_Continue,
7698 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007699 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007700 letters, digits, underscore). However, given the current
7701 definition of XID_Start and XID_Continue, it is sufficient
7702 to check just for these, except that _ must be allowed
7703 as starting an identifier. */
7704 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7705 return 0;
7706
7707 e = p + PyUnicode_GET_SIZE(self);
7708 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 if (!_PyUnicode_IsXidContinue(*p))
7710 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007711 }
7712 return 1;
7713}
7714
7715PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007716 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007717\n\
7718Return True if S is a valid identifier according\n\
7719to the language definition.");
7720
7721static PyObject*
7722unicode_isidentifier(PyObject *self)
7723{
7724 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7725}
7726
Georg Brandl559e5d72008-06-11 18:37:52 +00007727PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007729\n\
7730Return True if all characters in S are considered\n\
7731printable in repr() or S is empty, False otherwise.");
7732
7733static PyObject*
7734unicode_isprintable(PyObject *self)
7735{
7736 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7737 register const Py_UNICODE *e;
7738
7739 /* Shortcut for single character strings */
7740 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7741 Py_RETURN_TRUE;
7742 }
7743
7744 e = p + PyUnicode_GET_SIZE(self);
7745 for (; p < e; p++) {
7746 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7747 Py_RETURN_FALSE;
7748 }
7749 }
7750 Py_RETURN_TRUE;
7751}
7752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007753PyDoc_STRVAR(join__doc__,
Georg Brandl628e6f92009-10-27 20:24:45 +00007754 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755\n\
7756Return a string which is the concatenation of the strings in the\n\
Georg Brandl628e6f92009-10-27 20:24:45 +00007757iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758
7759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007760unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007762 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763}
7764
Martin v. Löwis18e16552006-02-15 17:27:45 +00007765static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766unicode_length(PyUnicodeObject *self)
7767{
7768 return self->length;
7769}
7770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007771PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007774Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007775done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
7777static PyObject *
7778unicode_ljust(PyUnicodeObject *self, PyObject *args)
7779{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007780 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007781 Py_UNICODE fillchar = ' ';
7782
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007783 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 return NULL;
7785
Tim Peters7a29bd52001-09-12 03:03:31 +00007786 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 Py_INCREF(self);
7788 return (PyObject*) self;
7789 }
7790
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007791 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792}
7793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007794PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007797Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798
7799static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007800unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 return fixup(self, fixlower);
7803}
7804
/* Strip modes.  Each value is also an index into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
7813
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007814/* externally visible for str.strip(unicode) */
7815PyObject *
7816_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7817{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007818 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7819 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7820 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7821 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7822 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007823
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007825
Benjamin Peterson14339b62009-01-31 16:36:08 +00007826 i = 0;
7827 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7829 i++;
7830 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007831 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007832
Benjamin Peterson14339b62009-01-31 16:36:08 +00007833 j = len;
7834 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 do {
7836 j--;
7837 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7838 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007839 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007840
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 Py_INCREF(self);
7843 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007844 }
7845 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007847}
7848
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849
7850static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007851do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007853 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7854 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007855
Benjamin Peterson14339b62009-01-31 16:36:08 +00007856 i = 0;
7857 if (striptype != RIGHTSTRIP) {
7858 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7859 i++;
7860 }
7861 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007862
Benjamin Peterson14339b62009-01-31 16:36:08 +00007863 j = len;
7864 if (striptype != LEFTSTRIP) {
7865 do {
7866 j--;
7867 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7868 j++;
7869 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007870
Benjamin Peterson14339b62009-01-31 16:36:08 +00007871 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7872 Py_INCREF(self);
7873 return (PyObject*)self;
7874 }
7875 else
7876 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877}
7878
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007879
7880static PyObject *
7881do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7882{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007883 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007884
Benjamin Peterson14339b62009-01-31 16:36:08 +00007885 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7886 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007887
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 if (sep != NULL && sep != Py_None) {
7889 if (PyUnicode_Check(sep))
7890 return _PyUnicode_XStrip(self, striptype, sep);
7891 else {
7892 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 "%s arg must be None or str",
7894 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007895 return NULL;
7896 }
7897 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007898
Benjamin Peterson14339b62009-01-31 16:36:08 +00007899 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007900}
7901
7902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007903PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007905\n\
7906Return a copy of the string S with leading and trailing\n\
7907whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007908If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007909
7910static PyObject *
7911unicode_strip(PyUnicodeObject *self, PyObject *args)
7912{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007913 if (PyTuple_GET_SIZE(args) == 0)
7914 return do_strip(self, BOTHSTRIP); /* Common case */
7915 else
7916 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007917}
7918
7919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007920PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007922\n\
7923Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007924If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007925
7926static PyObject *
7927unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7928{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007929 if (PyTuple_GET_SIZE(args) == 0)
7930 return do_strip(self, LEFTSTRIP); /* Common case */
7931 else
7932 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007933}
7934
7935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007936PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007938\n\
7939Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007940If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007941
7942static PyObject *
7943unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7944{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007945 if (PyTuple_GET_SIZE(args) == 0)
7946 return do_strip(self, RIGHTSTRIP); /* Common case */
7947 else
7948 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007949}
7950
7951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007953unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954{
7955 PyUnicodeObject *u;
7956 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007957 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007958 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959
Georg Brandl222de0f2009-04-12 12:01:50 +00007960 if (len < 1) {
7961 Py_INCREF(unicode_empty);
7962 return (PyObject *)unicode_empty;
7963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964
Tim Peters7a29bd52001-09-12 03:03:31 +00007965 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 /* no repeat, return original string */
7967 Py_INCREF(str);
7968 return (PyObject*) str;
7969 }
Tim Peters8f422462000-09-09 06:13:41 +00007970
7971 /* ensure # of chars needed doesn't overflow int and # of bytes
7972 * needed doesn't overflow size_t
7973 */
7974 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007975 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007976 PyErr_SetString(PyExc_OverflowError,
7977 "repeated string is too long");
7978 return NULL;
7979 }
7980 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7981 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7982 PyErr_SetString(PyExc_OverflowError,
7983 "repeated string is too long");
7984 return NULL;
7985 }
7986 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 if (!u)
7988 return NULL;
7989
7990 p = u->str;
7991
Georg Brandl222de0f2009-04-12 12:01:50 +00007992 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007993 Py_UNICODE_FILL(p, str->str[0], len);
7994 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007995 Py_ssize_t done = str->length; /* number of characters copied this far */
7996 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007998 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007999 Py_UNICODE_COPY(p+done, p, n);
8000 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 }
8003
8004 return (PyObject*) u;
8005}
8006
8007PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 PyObject *subobj,
8009 PyObject *replobj,
8010 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011{
8012 PyObject *self;
8013 PyObject *str1;
8014 PyObject *str2;
8015 PyObject *result;
8016
8017 self = PyUnicode_FromObject(obj);
8018 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 str1 = PyUnicode_FromObject(subobj);
8021 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 Py_DECREF(self);
8023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 }
8025 str2 = PyUnicode_FromObject(replobj);
8026 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 Py_DECREF(self);
8028 Py_DECREF(str1);
8029 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 }
Tim Petersced69f82003-09-16 20:30:58 +00008031 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 (PyUnicodeObject *)str1,
8033 (PyUnicodeObject *)str2,
8034 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 Py_DECREF(self);
8036 Py_DECREF(str1);
8037 Py_DECREF(str2);
8038 return result;
8039}
8040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008041PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043\n\
8044Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008045old replaced by new. If the optional argument count is\n\
8046given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047
8048static PyObject*
8049unicode_replace(PyUnicodeObject *self, PyObject *args)
8050{
8051 PyUnicodeObject *str1;
8052 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008053 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 PyObject *result;
8055
Martin v. Löwis18e16552006-02-15 17:27:45 +00008056 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 return NULL;
8058 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8059 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008062 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 Py_DECREF(str1);
8064 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066
8067 result = replace(self, str1, str2, maxcount);
8068
8069 Py_DECREF(str1);
8070 Py_DECREF(str2);
8071 return result;
8072}
8073
8074static
8075PyObject *unicode_repr(PyObject *unicode)
8076{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008077 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008078 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008079 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8080 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8081
8082 /* XXX(nnorwitz): rather than over-allocating, it would be
8083 better to choose a different scheme. Perhaps scan the
8084 first N-chars of the string and allocate based on that size.
8085 */
8086 /* Initial allocation is based on the longest-possible unichr
8087 escape.
8088
8089 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8090 unichr, so in this case it's the longest unichr escape. In
8091 narrow (UTF-16) builds this is five chars per source unichr
8092 since there are two unichrs in the surrogate pair, so in narrow
8093 (UTF-16) builds it's not the longest unichr escape.
8094
8095 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8096 so in the narrow (UTF-16) build case it's the longest unichr
8097 escape.
8098 */
8099
Walter Dörwald1ab83302007-05-18 17:15:44 +00008100 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008102#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008104#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008106#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008108 if (repr == NULL)
8109 return NULL;
8110
Walter Dörwald1ab83302007-05-18 17:15:44 +00008111 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008112
8113 /* Add quote */
8114 *p++ = (findchar(s, size, '\'') &&
8115 !findchar(s, size, '"')) ? '"' : '\'';
8116 while (size-- > 0) {
8117 Py_UNICODE ch = *s++;
8118
8119 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008120 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008121 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008122 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008123 continue;
8124 }
8125
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008127 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008128 *p++ = '\\';
8129 *p++ = 't';
8130 }
8131 else if (ch == '\n') {
8132 *p++ = '\\';
8133 *p++ = 'n';
8134 }
8135 else if (ch == '\r') {
8136 *p++ = '\\';
8137 *p++ = 'r';
8138 }
8139
8140 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008141 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008142 *p++ = '\\';
8143 *p++ = 'x';
8144 *p++ = hexdigits[(ch >> 4) & 0x000F];
8145 *p++ = hexdigits[ch & 0x000F];
8146 }
8147
Georg Brandl559e5d72008-06-11 18:37:52 +00008148 /* Copy ASCII characters as-is */
8149 else if (ch < 0x7F) {
8150 *p++ = ch;
8151 }
8152
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008154 else {
8155 Py_UCS4 ucs = ch;
8156
8157#ifndef Py_UNICODE_WIDE
8158 Py_UNICODE ch2 = 0;
8159 /* Get code point from surrogate pair */
8160 if (size > 0) {
8161 ch2 = *s;
8162 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008166 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008167 size--;
8168 }
8169 }
8170#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008171 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008172 (categories Z* and C* except ASCII space)
8173 */
8174 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8175 /* Map 8-bit characters to '\xhh' */
8176 if (ucs <= 0xff) {
8177 *p++ = '\\';
8178 *p++ = 'x';
8179 *p++ = hexdigits[(ch >> 4) & 0x000F];
8180 *p++ = hexdigits[ch & 0x000F];
8181 }
8182 /* Map 21-bit characters to '\U00xxxxxx' */
8183 else if (ucs >= 0x10000) {
8184 *p++ = '\\';
8185 *p++ = 'U';
8186 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8187 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8188 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8189 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8190 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8191 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8192 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8193 *p++ = hexdigits[ucs & 0x0000000F];
8194 }
8195 /* Map 16-bit characters to '\uxxxx' */
8196 else {
8197 *p++ = '\\';
8198 *p++ = 'u';
8199 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8200 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8201 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8202 *p++ = hexdigits[ucs & 0x000F];
8203 }
8204 }
8205 /* Copy characters as-is */
8206 else {
8207 *p++ = ch;
8208#ifndef Py_UNICODE_WIDE
8209 if (ucs >= 0x10000)
8210 *p++ = ch2;
8211#endif
8212 }
8213 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008214 }
8215 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008216 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008217
8218 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008219 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008220 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221}
8222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008223PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225\n\
8226Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008227such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228arguments start and end are interpreted as in slice notation.\n\
8229\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008230Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231
8232static PyObject *
8233unicode_rfind(PyUnicodeObject *self, PyObject *args)
8234{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008235 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008236 Py_ssize_t start;
8237 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008238 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239
Christian Heimes9cd17752007-11-18 19:35:23 +00008240 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242
Thomas Wouters477c8d52006-05-27 19:21:47 +00008243 result = stringlib_rfind_slice(
8244 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8245 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8246 start, end
8247 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248
8249 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008250
Christian Heimes217cfd12007-12-02 14:31:20 +00008251 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252}
8253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008254PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008257Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258
8259static PyObject *
8260unicode_rindex(PyUnicodeObject *self, PyObject *args)
8261{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008262 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008263 Py_ssize_t start;
8264 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008265 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266
Christian Heimes9cd17752007-11-18 19:35:23 +00008267 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269
Thomas Wouters477c8d52006-05-27 19:21:47 +00008270 result = stringlib_rfind_slice(
8271 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8272 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8273 start, end
8274 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275
8276 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008277
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 if (result < 0) {
8279 PyErr_SetString(PyExc_ValueError, "substring not found");
8280 return NULL;
8281 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008282 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283}
8284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008285PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008288Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008289done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290
8291static PyObject *
8292unicode_rjust(PyUnicodeObject *self, PyObject *args)
8293{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008294 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008295 Py_UNICODE fillchar = ' ';
8296
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008297 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 return NULL;
8299
Tim Peters7a29bd52001-09-12 03:03:31 +00008300 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 Py_INCREF(self);
8302 return (PyObject*) self;
8303 }
8304
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008305 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306}
8307
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 PyObject *sep,
8310 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311{
8312 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008313
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 s = PyUnicode_FromObject(s);
8315 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008316 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 if (sep != NULL) {
8318 sep = PyUnicode_FromObject(sep);
8319 if (sep == NULL) {
8320 Py_DECREF(s);
8321 return NULL;
8322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 }
8324
8325 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8326
8327 Py_DECREF(s);
8328 Py_XDECREF(sep);
8329 return result;
8330}
8331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008332PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334\n\
8335Return a list of the words in S, using sep as the\n\
8336delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008337splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008338whitespace string is a separator and empty strings are\n\
8339removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340
8341static PyObject*
8342unicode_split(PyUnicodeObject *self, PyObject *args)
8343{
8344 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008345 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346
Martin v. Löwis18e16552006-02-15 17:27:45 +00008347 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 return NULL;
8349
8350 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356}
8357
Thomas Wouters477c8d52006-05-27 19:21:47 +00008358PyObject *
8359PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8360{
8361 PyObject* str_obj;
8362 PyObject* sep_obj;
8363 PyObject* out;
8364
8365 str_obj = PyUnicode_FromObject(str_in);
8366 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008368 sep_obj = PyUnicode_FromObject(sep_in);
8369 if (!sep_obj) {
8370 Py_DECREF(str_obj);
8371 return NULL;
8372 }
8373
8374 out = stringlib_partition(
8375 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8376 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8377 );
8378
8379 Py_DECREF(sep_obj);
8380 Py_DECREF(str_obj);
8381
8382 return out;
8383}
8384
8385
8386PyObject *
8387PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8388{
8389 PyObject* str_obj;
8390 PyObject* sep_obj;
8391 PyObject* out;
8392
8393 str_obj = PyUnicode_FromObject(str_in);
8394 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008396 sep_obj = PyUnicode_FromObject(sep_in);
8397 if (!sep_obj) {
8398 Py_DECREF(str_obj);
8399 return NULL;
8400 }
8401
8402 out = stringlib_rpartition(
8403 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8404 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8405 );
8406
8407 Py_DECREF(sep_obj);
8408 Py_DECREF(str_obj);
8409
8410 return out;
8411}
8412
8413PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008415\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008416Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008417the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008418found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008419
8420static PyObject*
8421unicode_partition(PyUnicodeObject *self, PyObject *separator)
8422{
8423 return PyUnicode_Partition((PyObject *)self, separator);
8424}
8425
8426PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti4c81fbb2010-01-25 12:02:24 +00008427 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008428\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008429Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008430the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008431separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008432
8433static PyObject*
8434unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8435{
8436 return PyUnicode_RPartition((PyObject *)self, separator);
8437}
8438
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008439PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 PyObject *sep,
8441 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008442{
8443 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008445 s = PyUnicode_FromObject(s);
8446 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 if (sep != NULL) {
8449 sep = PyUnicode_FromObject(sep);
8450 if (sep == NULL) {
8451 Py_DECREF(s);
8452 return NULL;
8453 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008454 }
8455
8456 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8457
8458 Py_DECREF(s);
8459 Py_XDECREF(sep);
8460 return result;
8461}
8462
8463PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008465\n\
8466Return a list of the words in S, using sep as the\n\
8467delimiter string, starting at the end of the string and\n\
8468working to the front. If maxsplit is given, at most maxsplit\n\
8469splits are done. If sep is not specified, any whitespace string\n\
8470is a separator.");
8471
8472static PyObject*
8473unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8474{
8475 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008476 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008477
Martin v. Löwis18e16552006-02-15 17:27:45 +00008478 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008479 return NULL;
8480
8481 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008483 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008485 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008487}
8488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008489PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491\n\
8492Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008493Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008494is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495
8496static PyObject*
8497unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8498{
Guido van Rossum86662912000-04-11 15:38:46 +00008499 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500
Guido van Rossum86662912000-04-11 15:38:46 +00008501 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502 return NULL;
8503
Guido van Rossum86662912000-04-11 15:38:46 +00008504 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505}
8506
8507static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008508PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509{
Walter Dörwald346737f2007-05-31 10:44:43 +00008510 if (PyUnicode_CheckExact(self)) {
8511 Py_INCREF(self);
8512 return self;
8513 } else
8514 /* Subtype -- return genuine unicode string with the same value. */
8515 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8516 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517}
8518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008519PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521\n\
8522Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008523and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524
8525static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008526unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 return fixup(self, fixswapcase);
8529}
8530
Georg Brandlceee0772007-11-27 23:48:05 +00008531PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008533\n\
8534Return a translation table usable for str.translate().\n\
8535If there is only one argument, it must be a dictionary mapping Unicode\n\
8536ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008537Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008538If there are two arguments, they must be strings of equal length, and\n\
8539in the resulting dictionary, each character in x will be mapped to the\n\
8540character at the same position in y. If there is a third argument, it\n\
8541must be a string, whose characters will be mapped to None in the result.");
8542
8543static PyObject*
8544unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8545{
8546 PyObject *x, *y = NULL, *z = NULL;
8547 PyObject *new = NULL, *key, *value;
8548 Py_ssize_t i = 0;
8549 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008550
Georg Brandlceee0772007-11-27 23:48:05 +00008551 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8552 return NULL;
8553 new = PyDict_New();
8554 if (!new)
8555 return NULL;
8556 if (y != NULL) {
8557 /* x must be a string too, of equal length */
8558 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8559 if (!PyUnicode_Check(x)) {
8560 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8561 "be a string if there is a second argument");
8562 goto err;
8563 }
8564 if (PyUnicode_GET_SIZE(x) != ylen) {
8565 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8566 "arguments must have equal length");
8567 goto err;
8568 }
8569 /* create entries for translating chars in x to those in y */
8570 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008571 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8572 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008573 if (!key || !value)
8574 goto err;
8575 res = PyDict_SetItem(new, key, value);
8576 Py_DECREF(key);
8577 Py_DECREF(value);
8578 if (res < 0)
8579 goto err;
8580 }
8581 /* create entries for deleting chars in z */
8582 if (z != NULL) {
8583 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008584 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008585 if (!key)
8586 goto err;
8587 res = PyDict_SetItem(new, key, Py_None);
8588 Py_DECREF(key);
8589 if (res < 0)
8590 goto err;
8591 }
8592 }
8593 } else {
8594 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008595 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008596 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8597 "to maketrans it must be a dict");
8598 goto err;
8599 }
8600 /* copy entries into the new dict, converting string keys to int keys */
8601 while (PyDict_Next(x, &i, &key, &value)) {
8602 if (PyUnicode_Check(key)) {
8603 /* convert string keys to integer keys */
8604 PyObject *newkey;
8605 if (PyUnicode_GET_SIZE(key) != 1) {
8606 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8607 "table must be of length 1");
8608 goto err;
8609 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008610 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008611 if (!newkey)
8612 goto err;
8613 res = PyDict_SetItem(new, newkey, value);
8614 Py_DECREF(newkey);
8615 if (res < 0)
8616 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008617 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008618 /* just keep integer keys */
8619 if (PyDict_SetItem(new, key, value) < 0)
8620 goto err;
8621 } else {
8622 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8623 "be strings or integers");
8624 goto err;
8625 }
8626 }
8627 }
8628 return new;
8629 err:
8630 Py_DECREF(new);
8631 return NULL;
8632}
8633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008634PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636\n\
8637Return a copy of the string S, where all characters have been mapped\n\
8638through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008639Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008640Unmapped characters are left untouched. Characters mapped to None\n\
8641are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642
8643static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008644unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645{
Georg Brandlceee0772007-11-27 23:48:05 +00008646 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647}
8648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008649PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008652Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653
8654static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008655unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 return fixup(self, fixupper);
8658}
8659
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008660PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008663Pad a numeric string S with zeros on the left, to fill a field\n\
8664of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665
8666static PyObject *
8667unicode_zfill(PyUnicodeObject *self, PyObject *args)
8668{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008669 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 PyUnicodeObject *u;
8671
Martin v. Löwis18e16552006-02-15 17:27:45 +00008672 Py_ssize_t width;
8673 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 return NULL;
8675
8676 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008677 if (PyUnicode_CheckExact(self)) {
8678 Py_INCREF(self);
8679 return (PyObject*) self;
8680 }
8681 else
8682 return PyUnicode_FromUnicode(
8683 PyUnicode_AS_UNICODE(self),
8684 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 }
8687
8688 fill = width - self->length;
8689
8690 u = pad(self, fill, 0, '0');
8691
Walter Dörwald068325e2002-04-15 13:36:47 +00008692 if (u == NULL)
8693 return NULL;
8694
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 if (u->str[fill] == '+' || u->str[fill] == '-') {
8696 /* move sign to beginning of string */
8697 u->str[0] = u->str[fill];
8698 u->str[fill] = '0';
8699 }
8700
8701 return (PyObject*) u;
8702}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703
#if 0
/* Compiled out by the surrounding #if 0: returns the value of numfree as
   an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008712PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008715Return True if S starts with the specified prefix, False otherwise.\n\
8716With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008717With optional end, stop comparing S at that position.\n\
8718prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719
8720static PyObject *
8721unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008724 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008726 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008727 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008728 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008730 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8732 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008733 if (PyTuple_Check(subobj)) {
8734 Py_ssize_t i;
8735 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8736 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008738 if (substring == NULL)
8739 return NULL;
8740 result = tailmatch(self, substring, start, end, -1);
8741 Py_DECREF(substring);
8742 if (result) {
8743 Py_RETURN_TRUE;
8744 }
8745 }
8746 /* nothing matched */
8747 Py_RETURN_FALSE;
8748 }
8749 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008752 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008754 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755}
8756
8757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008758PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008761Return True if S ends with the specified suffix, False otherwise.\n\
8762With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008763With optional end, stop comparing S at that position.\n\
8764suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765
8766static PyObject *
8767unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008770 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008772 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008773 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008774 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008776 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8778 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008779 if (PyTuple_Check(subobj)) {
8780 Py_ssize_t i;
8781 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8782 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008784 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008786 result = tailmatch(self, substring, start, end, +1);
8787 Py_DECREF(substring);
8788 if (result) {
8789 Py_RETURN_TRUE;
8790 }
8791 }
8792 Py_RETURN_FALSE;
8793 }
8794 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008798 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008800 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801}
8802
Eric Smith8c663262007-08-25 02:26:07 +00008803#include "stringlib/string_format.h"
8804
8805PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008807\n\
8808");
8809
Eric Smith4a7d76d2008-05-30 18:10:19 +00008810static PyObject *
8811unicode__format__(PyObject* self, PyObject* args)
8812{
8813 PyObject *format_spec;
8814
8815 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8816 return NULL;
8817
8818 return _PyUnicode_FormatAdvanced(self,
8819 PyUnicode_AS_UNICODE(format_spec),
8820 PyUnicode_GET_SIZE(format_spec));
8821}
8822
Eric Smith8c663262007-08-25 02:26:07 +00008823PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008825\n\
8826");
8827
8828static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008829unicode__sizeof__(PyUnicodeObject *v)
8830{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008831 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8832 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008833}
8834
8835PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008837
8838static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008839unicode_getnewargs(PyUnicodeObject *v)
8840{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008841 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008842}
8843
8844
/* Method table for the unicode (str) type.

   Each entry is {name, C implementation, calling-convention flags,
   docstring}.  Entries that list only three fields leave the docstring
   slot implicitly NULL.  The table is terminated by the {NULL, NULL}
   sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* "format" is the only entry that also accepts keyword arguments. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* Underscore-prefixed helpers; registered without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    /* METH_STATIC: maketrans is registered as a static method. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8909
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008910static PyObject *
8911unicode_mod(PyObject *v, PyObject *w)
8912{
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 if (!PyUnicode_Check(v)) {
8914 Py_INCREF(Py_NotImplemented);
8915 return Py_NotImplemented;
8916 }
8917 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008918}
8919
/* Number protocol table for unicode objects.  Only nb_remainder is
   filled in (with unicode_mod, which forwards to PyUnicode_Format);
   the slots after it are not listed and are therefore
   zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
8926
/* Sequence protocol table for unicode objects: length, concatenation,
   repetition, item access and containment.  The slice and item/slice
   assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
8937
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008938static PyObject*
8939unicode_subscript(PyUnicodeObject* self, PyObject* item)
8940{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008941 if (PyIndex_Check(item)) {
8942 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008943 if (i == -1 && PyErr_Occurred())
8944 return NULL;
8945 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008946 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008947 return unicode_getitem(self, i);
8948 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008949 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008950 Py_UNICODE* source_buf;
8951 Py_UNICODE* result_buf;
8952 PyObject* result;
8953
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008954 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008956 return NULL;
8957 }
8958
8959 if (slicelength <= 0) {
8960 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008961 } else if (start == 0 && step == 1 && slicelength == self->length &&
8962 PyUnicode_CheckExact(self)) {
8963 Py_INCREF(self);
8964 return (PyObject *)self;
8965 } else if (step == 1) {
8966 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008967 } else {
8968 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008969 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8970 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008971
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 if (result_buf == NULL)
8973 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008974
8975 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8976 result_buf[i] = source_buf[cur];
8977 }
Tim Petersced69f82003-09-16 20:30:58 +00008978
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008979 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008980 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008981 return result;
8982 }
8983 } else {
8984 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8985 return NULL;
8986 }
8987}
8988
/* Mapping protocol table for unicode objects: length and subscripting
   (unicode_subscript handles both index and slice items).  The
   mp_ass_subscript slot is 0, so no item assignment handler is
   installed. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
8994
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996/* Helpers for PyUnicode_Format() */
8997
8998static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008999getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009001 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 (*p_argidx)++;
9004 if (arglen < 0)
9005 return args;
9006 else
9007 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 }
9009 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 return NULL;
9012}
9013
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009014/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009016static PyObject *
9017formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009019 char *p;
9020 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009022
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 x = PyFloat_AsDouble(v);
9024 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009025 return NULL;
9026
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009029
Eric Smith0923d1d2009-04-16 20:16:10 +00009030 p = PyOS_double_to_string(x, type, prec,
9031 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009032 if (p == NULL)
9033 return NULL;
9034 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009035 PyMem_Free(p);
9036 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037}
9038
Tim Peters38fd5b62000-09-21 05:43:11 +00009039static PyObject*
9040formatlong(PyObject *val, int flags, int prec, int type)
9041{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009042 char *buf;
9043 int len;
9044 PyObject *str; /* temporary string object. */
9045 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009046
Benjamin Peterson14339b62009-01-31 16:36:08 +00009047 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9048 if (!str)
9049 return NULL;
9050 result = PyUnicode_FromStringAndSize(buf, len);
9051 Py_DECREF(str);
9052 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009053}
9054
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055static int
9056formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009057 size_t buflen,
9058 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009060 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009061 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009062 if (PyUnicode_GET_SIZE(v) == 1) {
9063 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9064 buf[1] = '\0';
9065 return 1;
9066 }
9067#ifndef Py_UNICODE_WIDE
9068 if (PyUnicode_GET_SIZE(v) == 2) {
9069 /* Decode a valid surrogate pair */
9070 int c0 = PyUnicode_AS_UNICODE(v)[0];
9071 int c1 = PyUnicode_AS_UNICODE(v)[1];
9072 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9073 0xDC00 <= c1 && c1 <= 0xDFFF) {
9074 buf[0] = c0;
9075 buf[1] = c1;
9076 buf[2] = '\0';
9077 return 2;
9078 }
9079 }
9080#endif
9081 goto onError;
9082 }
9083 else {
9084 /* Integer input truncated to a character */
9085 long x;
9086 x = PyLong_AsLong(v);
9087 if (x == -1 && PyErr_Occurred())
9088 goto onError;
9089
9090 if (x < 0 || x > 0x10ffff) {
9091 PyErr_SetString(PyExc_OverflowError,
9092 "%c arg not in range(0x110000)");
9093 return -1;
9094 }
9095
9096#ifndef Py_UNICODE_WIDE
9097 if (x > 0xffff) {
9098 x -= 0x10000;
9099 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9100 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9101 return 2;
9102 }
9103#endif
9104 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009105 buf[1] = '\0';
9106 return 1;
9107 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009108
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009110 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009112 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113}
9114
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the small
   scratch buffer (formatbuf) that PyUnicode_Format() uses for the '%%'
   and '%c' conversions.  formatchar() presumes at least 3 elements.
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009119
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122{
9123 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009124 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 int args_owned = 0;
9126 PyUnicodeObject *result = NULL;
9127 PyObject *dict = NULL;
9128 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009129
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 PyErr_BadInternalCall();
9132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133 }
9134 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009135 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 fmt = PyUnicode_AS_UNICODE(uformat);
9138 fmtcnt = PyUnicode_GET_SIZE(uformat);
9139
9140 reslen = rescnt = fmtcnt + 100;
9141 result = _PyUnicode_New(reslen);
9142 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144 res = PyUnicode_AS_UNICODE(result);
9145
9146 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 arglen = PyTuple_Size(args);
9148 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149 }
9150 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 arglen = -1;
9152 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009154 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009155 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157
9158 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 if (*fmt != '%') {
9160 if (--rescnt < 0) {
9161 rescnt = fmtcnt + 100;
9162 reslen += rescnt;
9163 if (_PyUnicode_Resize(&result, reslen) < 0)
9164 goto onError;
9165 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9166 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009167 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009169 }
9170 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 /* Got a format specifier */
9172 int flags = 0;
9173 Py_ssize_t width = -1;
9174 int prec = -1;
9175 Py_UNICODE c = '\0';
9176 Py_UNICODE fill;
9177 int isnumok;
9178 PyObject *v = NULL;
9179 PyObject *temp = NULL;
9180 Py_UNICODE *pbuf;
9181 Py_UNICODE sign;
9182 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009183 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 fmt++;
9186 if (*fmt == '(') {
9187 Py_UNICODE *keystart;
9188 Py_ssize_t keylen;
9189 PyObject *key;
9190 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009191
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 if (dict == NULL) {
9193 PyErr_SetString(PyExc_TypeError,
9194 "format requires a mapping");
9195 goto onError;
9196 }
9197 ++fmt;
9198 --fmtcnt;
9199 keystart = fmt;
9200 /* Skip over balanced parentheses */
9201 while (pcount > 0 && --fmtcnt >= 0) {
9202 if (*fmt == ')')
9203 --pcount;
9204 else if (*fmt == '(')
9205 ++pcount;
9206 fmt++;
9207 }
9208 keylen = fmt - keystart - 1;
9209 if (fmtcnt < 0 || pcount > 0) {
9210 PyErr_SetString(PyExc_ValueError,
9211 "incomplete format key");
9212 goto onError;
9213 }
9214#if 0
9215 /* keys are converted to strings using UTF-8 and
9216 then looked up since Python uses strings to hold
9217 variables names etc. in its namespaces and we
9218 wouldn't want to break common idioms. */
9219 key = PyUnicode_EncodeUTF8(keystart,
9220 keylen,
9221 NULL);
9222#else
9223 key = PyUnicode_FromUnicode(keystart, keylen);
9224#endif
9225 if (key == NULL)
9226 goto onError;
9227 if (args_owned) {
9228 Py_DECREF(args);
9229 args_owned = 0;
9230 }
9231 args = PyObject_GetItem(dict, key);
9232 Py_DECREF(key);
9233 if (args == NULL) {
9234 goto onError;
9235 }
9236 args_owned = 1;
9237 arglen = -1;
9238 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009239 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009240 while (--fmtcnt >= 0) {
9241 switch (c = *fmt++) {
9242 case '-': flags |= F_LJUST; continue;
9243 case '+': flags |= F_SIGN; continue;
9244 case ' ': flags |= F_BLANK; continue;
9245 case '#': flags |= F_ALT; continue;
9246 case '0': flags |= F_ZERO; continue;
9247 }
9248 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009249 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009250 if (c == '*') {
9251 v = getnextarg(args, arglen, &argidx);
9252 if (v == NULL)
9253 goto onError;
9254 if (!PyLong_Check(v)) {
9255 PyErr_SetString(PyExc_TypeError,
9256 "* wants int");
9257 goto onError;
9258 }
9259 width = PyLong_AsLong(v);
9260 if (width == -1 && PyErr_Occurred())
9261 goto onError;
9262 if (width < 0) {
9263 flags |= F_LJUST;
9264 width = -width;
9265 }
9266 if (--fmtcnt >= 0)
9267 c = *fmt++;
9268 }
9269 else if (c >= '0' && c <= '9') {
9270 width = c - '0';
9271 while (--fmtcnt >= 0) {
9272 c = *fmt++;
9273 if (c < '0' || c > '9')
9274 break;
9275 if ((width*10) / 10 != width) {
9276 PyErr_SetString(PyExc_ValueError,
9277 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009278 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009279 }
9280 width = width*10 + (c - '0');
9281 }
9282 }
9283 if (c == '.') {
9284 prec = 0;
9285 if (--fmtcnt >= 0)
9286 c = *fmt++;
9287 if (c == '*') {
9288 v = getnextarg(args, arglen, &argidx);
9289 if (v == NULL)
9290 goto onError;
9291 if (!PyLong_Check(v)) {
9292 PyErr_SetString(PyExc_TypeError,
9293 "* wants int");
9294 goto onError;
9295 }
9296 prec = PyLong_AsLong(v);
9297 if (prec == -1 && PyErr_Occurred())
9298 goto onError;
9299 if (prec < 0)
9300 prec = 0;
9301 if (--fmtcnt >= 0)
9302 c = *fmt++;
9303 }
9304 else if (c >= '0' && c <= '9') {
9305 prec = c - '0';
9306 while (--fmtcnt >= 0) {
9307 c = Py_CHARMASK(*fmt++);
9308 if (c < '0' || c > '9')
9309 break;
9310 if ((prec*10) / 10 != prec) {
9311 PyErr_SetString(PyExc_ValueError,
9312 "prec too big");
9313 goto onError;
9314 }
9315 prec = prec*10 + (c - '0');
9316 }
9317 }
9318 } /* prec */
9319 if (fmtcnt >= 0) {
9320 if (c == 'h' || c == 'l' || c == 'L') {
9321 if (--fmtcnt >= 0)
9322 c = *fmt++;
9323 }
9324 }
9325 if (fmtcnt < 0) {
9326 PyErr_SetString(PyExc_ValueError,
9327 "incomplete format");
9328 goto onError;
9329 }
9330 if (c != '%') {
9331 v = getnextarg(args, arglen, &argidx);
9332 if (v == NULL)
9333 goto onError;
9334 }
9335 sign = 0;
9336 fill = ' ';
9337 switch (c) {
9338
9339 case '%':
9340 pbuf = formatbuf;
9341 /* presume that buffer length is at least 1 */
9342 pbuf[0] = '%';
9343 len = 1;
9344 break;
9345
9346 case 's':
9347 case 'r':
9348 case 'a':
Victor Stinnerabdb21a2010-03-22 12:53:14 +00009349 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009350 temp = v;
9351 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009352 }
9353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 if (c == 's')
9355 temp = PyObject_Str(v);
9356 else if (c == 'r')
9357 temp = PyObject_Repr(v);
9358 else
9359 temp = PyObject_ASCII(v);
9360 if (temp == NULL)
9361 goto onError;
9362 if (PyUnicode_Check(temp))
9363 /* nothing to do */;
9364 else {
9365 Py_DECREF(temp);
9366 PyErr_SetString(PyExc_TypeError,
9367 "%s argument has non-string str()");
9368 goto onError;
9369 }
9370 }
9371 pbuf = PyUnicode_AS_UNICODE(temp);
9372 len = PyUnicode_GET_SIZE(temp);
9373 if (prec >= 0 && len > prec)
9374 len = prec;
9375 break;
9376
9377 case 'i':
9378 case 'd':
9379 case 'u':
9380 case 'o':
9381 case 'x':
9382 case 'X':
9383 if (c == 'i')
9384 c = 'd';
9385 isnumok = 0;
9386 if (PyNumber_Check(v)) {
9387 PyObject *iobj=NULL;
9388
9389 if (PyLong_Check(v)) {
9390 iobj = v;
9391 Py_INCREF(iobj);
9392 }
9393 else {
9394 iobj = PyNumber_Long(v);
9395 }
9396 if (iobj!=NULL) {
9397 if (PyLong_Check(iobj)) {
9398 isnumok = 1;
9399 temp = formatlong(iobj, flags, prec, c);
9400 Py_DECREF(iobj);
9401 if (!temp)
9402 goto onError;
9403 pbuf = PyUnicode_AS_UNICODE(temp);
9404 len = PyUnicode_GET_SIZE(temp);
9405 sign = 1;
9406 }
9407 else {
9408 Py_DECREF(iobj);
9409 }
9410 }
9411 }
9412 if (!isnumok) {
9413 PyErr_Format(PyExc_TypeError,
9414 "%%%c format: a number is required, "
9415 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9416 goto onError;
9417 }
9418 if (flags & F_ZERO)
9419 fill = '0';
9420 break;
9421
9422 case 'e':
9423 case 'E':
9424 case 'f':
9425 case 'F':
9426 case 'g':
9427 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009428 temp = formatfloat(v, flags, prec, c);
9429 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009430 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009431 pbuf = PyUnicode_AS_UNICODE(temp);
9432 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009433 sign = 1;
9434 if (flags & F_ZERO)
9435 fill = '0';
9436 break;
9437
9438 case 'c':
9439 pbuf = formatbuf;
9440 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9441 if (len < 0)
9442 goto onError;
9443 break;
9444
9445 default:
9446 PyErr_Format(PyExc_ValueError,
9447 "unsupported format character '%c' (0x%x) "
9448 "at index %zd",
9449 (31<=c && c<=126) ? (char)c : '?',
9450 (int)c,
9451 (Py_ssize_t)(fmt - 1 -
9452 PyUnicode_AS_UNICODE(uformat)));
9453 goto onError;
9454 }
9455 if (sign) {
9456 if (*pbuf == '-' || *pbuf == '+') {
9457 sign = *pbuf++;
9458 len--;
9459 }
9460 else if (flags & F_SIGN)
9461 sign = '+';
9462 else if (flags & F_BLANK)
9463 sign = ' ';
9464 else
9465 sign = 0;
9466 }
9467 if (width < len)
9468 width = len;
9469 if (rescnt - (sign != 0) < width) {
9470 reslen -= rescnt;
9471 rescnt = width + fmtcnt + 100;
9472 reslen += rescnt;
9473 if (reslen < 0) {
9474 Py_XDECREF(temp);
9475 PyErr_NoMemory();
9476 goto onError;
9477 }
9478 if (_PyUnicode_Resize(&result, reslen) < 0) {
9479 Py_XDECREF(temp);
9480 goto onError;
9481 }
9482 res = PyUnicode_AS_UNICODE(result)
9483 + reslen - rescnt;
9484 }
9485 if (sign) {
9486 if (fill != ' ')
9487 *res++ = sign;
9488 rescnt--;
9489 if (width > len)
9490 width--;
9491 }
9492 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9493 assert(pbuf[0] == '0');
9494 assert(pbuf[1] == c);
9495 if (fill != ' ') {
9496 *res++ = *pbuf++;
9497 *res++ = *pbuf++;
9498 }
9499 rescnt -= 2;
9500 width -= 2;
9501 if (width < 0)
9502 width = 0;
9503 len -= 2;
9504 }
9505 if (width > len && !(flags & F_LJUST)) {
9506 do {
9507 --rescnt;
9508 *res++ = fill;
9509 } while (--width > len);
9510 }
9511 if (fill == ' ') {
9512 if (sign)
9513 *res++ = sign;
9514 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9515 assert(pbuf[0] == '0');
9516 assert(pbuf[1] == c);
9517 *res++ = *pbuf++;
9518 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009519 }
9520 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 Py_UNICODE_COPY(res, pbuf, len);
9522 res += len;
9523 rescnt -= len;
9524 while (--width >= len) {
9525 --rescnt;
9526 *res++ = ' ';
9527 }
9528 if (dict && (argidx < arglen) && c != '%') {
9529 PyErr_SetString(PyExc_TypeError,
9530 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009531 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 goto onError;
9533 }
9534 Py_XDECREF(temp);
9535 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536 } /* until end */
9537 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 PyErr_SetString(PyExc_TypeError,
9539 "not all arguments converted during string formatting");
9540 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541 }
9542
Thomas Woutersa96affe2006-03-12 00:29:36 +00009543 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 }
9548 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549 return (PyObject *)result;
9550
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552 Py_XDECREF(result);
9553 Py_DECREF(uformat);
9554 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 }
9557 return NULL;
9558}
9559
Jeremy Hylton938ace62002-07-17 16:30:39 +00009560static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009561unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9562
Tim Peters6d6c1a32001-08-02 04:15:00 +00009563static PyObject *
9564unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9565{
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009567 static char *kwlist[] = {"object", "encoding", "errors", 0};
9568 char *encoding = NULL;
9569 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009570
Benjamin Peterson14339b62009-01-31 16:36:08 +00009571 if (type != &PyUnicode_Type)
9572 return unicode_subtype_new(type, args, kwds);
9573 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009575 return NULL;
9576 if (x == NULL)
9577 return (PyObject *)_PyUnicode_New(0);
9578 if (encoding == NULL && errors == NULL)
9579 return PyObject_Str(x);
9580 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009581 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009582}
9583
Guido van Rossume023fe02001-08-30 03:12:59 +00009584static PyObject *
9585unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9586{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009587 PyUnicodeObject *tmp, *pnew;
9588 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009589
Benjamin Peterson14339b62009-01-31 16:36:08 +00009590 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9591 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9592 if (tmp == NULL)
9593 return NULL;
9594 assert(PyUnicode_Check(tmp));
9595 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9596 if (pnew == NULL) {
9597 Py_DECREF(tmp);
9598 return NULL;
9599 }
9600 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9601 if (pnew->str == NULL) {
9602 _Py_ForgetReference((PyObject *)pnew);
9603 PyObject_Del(pnew);
9604 Py_DECREF(tmp);
9605 return PyErr_NoMemory();
9606 }
9607 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9608 pnew->length = n;
9609 pnew->hash = tmp->hash;
9610 Py_DECREF(tmp);
9611 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009612}
9613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009614PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009615 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009616\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009617Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009618encoding defaults to the current default string encoding.\n\
9619errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009620
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009621static PyObject *unicode_iter(PyObject *seq);
9622
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009624 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009625 "str", /* tp_name */
9626 sizeof(PyUnicodeObject), /* tp_size */
9627 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009629 (destructor)unicode_dealloc, /* tp_dealloc */
9630 0, /* tp_print */
9631 0, /* tp_getattr */
9632 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009633 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009634 unicode_repr, /* tp_repr */
9635 &unicode_as_number, /* tp_as_number */
9636 &unicode_as_sequence, /* tp_as_sequence */
9637 &unicode_as_mapping, /* tp_as_mapping */
9638 (hashfunc) unicode_hash, /* tp_hash*/
9639 0, /* tp_call*/
9640 (reprfunc) unicode_str, /* tp_str */
9641 PyObject_GenericGetAttr, /* tp_getattro */
9642 0, /* tp_setattro */
9643 0, /* tp_as_buffer */
9644 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009645 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009646 unicode_doc, /* tp_doc */
9647 0, /* tp_traverse */
9648 0, /* tp_clear */
9649 PyUnicode_RichCompare, /* tp_richcompare */
9650 0, /* tp_weaklistoffset */
9651 unicode_iter, /* tp_iter */
9652 0, /* tp_iternext */
9653 unicode_methods, /* tp_methods */
9654 0, /* tp_members */
9655 0, /* tp_getset */
9656 &PyBaseObject_Type, /* tp_base */
9657 0, /* tp_dict */
9658 0, /* tp_descr_get */
9659 0, /* tp_descr_set */
9660 0, /* tp_dictoffset */
9661 0, /* tp_init */
9662 0, /* tp_alloc */
9663 unicode_new, /* tp_new */
9664 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665};
9666
9667/* Initialize the Unicode implementation */
9668
Thomas Wouters78890102000-07-22 19:25:51 +00009669void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009671 int i;
9672
Thomas Wouters477c8d52006-05-27 19:21:47 +00009673 /* XXX - move this array to unicodectype.c ? */
9674 Py_UNICODE linebreak[] = {
9675 0x000A, /* LINE FEED */
9676 0x000D, /* CARRIAGE RETURN */
9677 0x001C, /* FILE SEPARATOR */
9678 0x001D, /* GROUP SEPARATOR */
9679 0x001E, /* RECORD SEPARATOR */
9680 0x0085, /* NEXT LINE */
9681 0x2028, /* LINE SEPARATOR */
9682 0x2029, /* PARAGRAPH SEPARATOR */
9683 };
9684
Fred Drakee4315f52000-05-09 19:53:39 +00009685 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009686 free_list = NULL;
9687 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009689 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009690 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009691
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009692 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009694 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009696
9697 /* initialize the linebreak bloom filter */
9698 bloom_linebreak = make_bloom_mask(
9699 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9700 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009701
9702 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703}
9704
9705/* Finalize the Unicode implementation */
9706
Christian Heimesa156e092008-02-16 07:38:31 +00009707int
9708PyUnicode_ClearFreeList(void)
9709{
9710 int freelist_size = numfree;
9711 PyUnicodeObject *u;
9712
9713 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009714 PyUnicodeObject *v = u;
9715 u = *(PyUnicodeObject **)u;
9716 if (v->str)
9717 PyObject_DEL(v->str);
9718 Py_XDECREF(v->defenc);
9719 PyObject_Del(v);
9720 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009721 }
9722 free_list = NULL;
9723 assert(numfree == 0);
9724 return freelist_size;
9725}
9726
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727void
Thomas Wouters78890102000-07-22 19:25:51 +00009728_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009730 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009732 Py_XDECREF(unicode_empty);
9733 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009734
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009735 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 if (unicode_latin1[i]) {
9737 Py_DECREF(unicode_latin1[i]);
9738 unicode_latin1[i] = NULL;
9739 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009740 }
Christian Heimesa156e092008-02-16 07:38:31 +00009741 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009743
Walter Dörwald16807132007-05-25 13:52:07 +00009744void
9745PyUnicode_InternInPlace(PyObject **p)
9746{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009747 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9748 PyObject *t;
9749 if (s == NULL || !PyUnicode_Check(s))
9750 Py_FatalError(
9751 "PyUnicode_InternInPlace: unicode strings only please!");
9752 /* If it's a subclass, we don't really know what putting
9753 it in the interned dict might do. */
9754 if (!PyUnicode_CheckExact(s))
9755 return;
9756 if (PyUnicode_CHECK_INTERNED(s))
9757 return;
9758 if (interned == NULL) {
9759 interned = PyDict_New();
9760 if (interned == NULL) {
9761 PyErr_Clear(); /* Don't leave an exception */
9762 return;
9763 }
9764 }
9765 /* It might be that the GetItem call fails even
9766 though the key is present in the dictionary,
9767 namely when this happens during a stack overflow. */
9768 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009769 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009770 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009771
Benjamin Peterson29060642009-01-31 22:14:21 +00009772 if (t) {
9773 Py_INCREF(t);
9774 Py_DECREF(*p);
9775 *p = t;
9776 return;
9777 }
Walter Dörwald16807132007-05-25 13:52:07 +00009778
Benjamin Peterson14339b62009-01-31 16:36:08 +00009779 PyThreadState_GET()->recursion_critical = 1;
9780 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9781 PyErr_Clear();
9782 PyThreadState_GET()->recursion_critical = 0;
9783 return;
9784 }
9785 PyThreadState_GET()->recursion_critical = 0;
9786 /* The two references in interned are not counted by refcnt.
9787 The deallocator will take care of this */
9788 Py_REFCNT(s) -= 2;
9789 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009790}
9791
9792void
9793PyUnicode_InternImmortal(PyObject **p)
9794{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009795 PyUnicode_InternInPlace(p);
9796 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9797 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9798 Py_INCREF(*p);
9799 }
Walter Dörwald16807132007-05-25 13:52:07 +00009800}
9801
9802PyObject *
9803PyUnicode_InternFromString(const char *cp)
9804{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009805 PyObject *s = PyUnicode_FromString(cp);
9806 if (s == NULL)
9807 return NULL;
9808 PyUnicode_InternInPlace(&s);
9809 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009810}
9811
9812void _Py_ReleaseInternedUnicodeStrings(void)
9813{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009814 PyObject *keys;
9815 PyUnicodeObject *s;
9816 Py_ssize_t i, n;
9817 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009818
Benjamin Peterson14339b62009-01-31 16:36:08 +00009819 if (interned == NULL || !PyDict_Check(interned))
9820 return;
9821 keys = PyDict_Keys(interned);
9822 if (keys == NULL || !PyList_Check(keys)) {
9823 PyErr_Clear();
9824 return;
9825 }
Walter Dörwald16807132007-05-25 13:52:07 +00009826
Benjamin Peterson14339b62009-01-31 16:36:08 +00009827 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9828 detector, interned unicode strings are not forcibly deallocated;
9829 rather, we give them their stolen references back, and then clear
9830 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009831
Benjamin Peterson14339b62009-01-31 16:36:08 +00009832 n = PyList_GET_SIZE(keys);
9833 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009834 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009835 for (i = 0; i < n; i++) {
9836 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9837 switch (s->state) {
9838 case SSTATE_NOT_INTERNED:
9839 /* XXX Shouldn't happen */
9840 break;
9841 case SSTATE_INTERNED_IMMORTAL:
9842 Py_REFCNT(s) += 1;
9843 immortal_size += s->length;
9844 break;
9845 case SSTATE_INTERNED_MORTAL:
9846 Py_REFCNT(s) += 2;
9847 mortal_size += s->length;
9848 break;
9849 default:
9850 Py_FatalError("Inconsistent interned string state.");
9851 }
9852 s->state = SSTATE_NOT_INTERNED;
9853 }
9854 fprintf(stderr, "total size of all interned strings: "
9855 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9856 "mortal/immortal\n", mortal_size, immortal_size);
9857 Py_DECREF(keys);
9858 PyDict_Clear(interned);
9859 Py_DECREF(interned);
9860 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009861}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009862
9863
9864/********************* Unicode Iterator **************************/
9865
9866typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009867 PyObject_HEAD
9868 Py_ssize_t it_index;
9869 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009870} unicodeiterobject;
9871
9872static void
9873unicodeiter_dealloc(unicodeiterobject *it)
9874{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009875 _PyObject_GC_UNTRACK(it);
9876 Py_XDECREF(it->it_seq);
9877 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009878}
9879
9880static int
9881unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9882{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009883 Py_VISIT(it->it_seq);
9884 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009885}
9886
9887static PyObject *
9888unicodeiter_next(unicodeiterobject *it)
9889{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009890 PyUnicodeObject *seq;
9891 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009892
Benjamin Peterson14339b62009-01-31 16:36:08 +00009893 assert(it != NULL);
9894 seq = it->it_seq;
9895 if (seq == NULL)
9896 return NULL;
9897 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009898
Benjamin Peterson14339b62009-01-31 16:36:08 +00009899 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9900 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009901 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009902 if (item != NULL)
9903 ++it->it_index;
9904 return item;
9905 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009906
Benjamin Peterson14339b62009-01-31 16:36:08 +00009907 Py_DECREF(seq);
9908 it->it_seq = NULL;
9909 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009910}
9911
9912static PyObject *
9913unicodeiter_len(unicodeiterobject *it)
9914{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009915 Py_ssize_t len = 0;
9916 if (it->it_seq)
9917 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9918 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009919}
9920
9921PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9922
9923static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009924 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009925 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009926 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009927};
9928
9929PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009930 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9931 "str_iterator", /* tp_name */
9932 sizeof(unicodeiterobject), /* tp_basicsize */
9933 0, /* tp_itemsize */
9934 /* methods */
9935 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9936 0, /* tp_print */
9937 0, /* tp_getattr */
9938 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009939 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009940 0, /* tp_repr */
9941 0, /* tp_as_number */
9942 0, /* tp_as_sequence */
9943 0, /* tp_as_mapping */
9944 0, /* tp_hash */
9945 0, /* tp_call */
9946 0, /* tp_str */
9947 PyObject_GenericGetAttr, /* tp_getattro */
9948 0, /* tp_setattro */
9949 0, /* tp_as_buffer */
9950 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9951 0, /* tp_doc */
9952 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9953 0, /* tp_clear */
9954 0, /* tp_richcompare */
9955 0, /* tp_weaklistoffset */
9956 PyObject_SelfIter, /* tp_iter */
9957 (iternextfunc)unicodeiter_next, /* tp_iternext */
9958 unicodeiter_methods, /* tp_methods */
9959 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009960};
9961
9962static PyObject *
9963unicode_iter(PyObject *seq)
9964{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009965 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009966
Benjamin Peterson14339b62009-01-31 16:36:08 +00009967 if (!PyUnicode_Check(seq)) {
9968 PyErr_BadInternalCall();
9969 return NULL;
9970 }
9971 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9972 if (it == NULL)
9973 return NULL;
9974 it->it_index = 0;
9975 Py_INCREF(seq);
9976 it->it_seq = (PyUnicodeObject *)seq;
9977 _PyObject_GC_TRACK(it);
9978 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009979}
9980
Martin v. Löwis5b222132007-06-10 09:51:05 +00009981size_t
9982Py_UNICODE_strlen(const Py_UNICODE *u)
9983{
9984 int res = 0;
9985 while(*u++)
9986 res++;
9987 return res;
9988}
9989
9990Py_UNICODE*
9991Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9992{
9993 Py_UNICODE *u = s1;
9994 while ((*u++ = *s2++));
9995 return s1;
9996}
9997
9998Py_UNICODE*
9999Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10000{
10001 Py_UNICODE *u = s1;
10002 while ((*u++ = *s2++))
10003 if (n-- == 0)
10004 break;
10005 return s1;
10006}
10007
10008int
10009Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10010{
10011 while (*s1 && *s2 && *s1 == *s2)
10012 s1++, s2++;
10013 if (*s1 && *s2)
10014 return (*s1 < *s2) ? -1 : +1;
10015 if (*s1)
10016 return 1;
10017 if (*s2)
10018 return -1;
10019 return 0;
10020}
10021
10022Py_UNICODE*
10023Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10024{
10025 const Py_UNICODE *p;
10026 for (p = s; *p; p++)
10027 if (*p == c)
10028 return (Py_UNICODE*)p;
10029 return NULL;
10030}
10031
10032
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010033#ifdef __cplusplus
10034}
10035#endif
10036
10037
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010038/*
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 Local variables:
10040 c-basic-offset: 4
10041 indent-tabs-mode: nil
10042 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010043*/