blob: 3d49db162b473fb64f5c53e531783a1a083ac10e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Lookup table for the 128 ASCII code points: an entry is 1 when the
   code point is treated as whitespace and 0 otherwise.  Each row below
   covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000157static PyObject *unicode_encode_call_errorhandler(const char *errors,
158 PyObject **errorHandler,const char *encoding, const char *reason,
159 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161
/* Lookup table for the 128 ASCII code points: an entry is 1 when the
   code point is treated as a line break and 0 otherwise.  Each row
   below covers eight consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
187
188
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000190PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000191{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000192#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000195 /* This is actually an illegal character, so it should
196 not be passed to unichr. */
197 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198#endif
199}
200
Thomas Wouters477c8d52006-05-27 19:21:47 +0000201/* --- Bloom Filters ----------------------------------------------------- */
202
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits
   (ch & (LONG_BIT - 1)) of each Unicode character as the bit index. */
206
207/* the linebreak mask is set up by Unicode_Init below */
208
/* Integer type used to hold one bloom-style bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask consulted by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in mask (mask must be
   an lvalue).
   BLOOM(mask, ch): non-zero if the bit selected by ch is set in mask.

   The bit index is (ch & (LONG_BIT - 1)).  The shifted constant is 1UL
   rather than 1 so that the shift is carried out at the width of
   BLOOM_MASK: shifting a plain int by a count that can reach
   LONG_BIT - 1 is undefined behaviour whenever long is wider than int,
   and would leave the upper half of the mask unusable. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (LONG_BIT - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (LONG_BIT - 1)))))

/* Line break test: ASCII characters are looked up directly in
   ascii_linebreak; anything else is first screened through the
   bloom_linebreak mask and only then checked with
   Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000219
220Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
221{
222 /* calculate simple bloom-style bitmask for a given unicode string */
223
224 long mask;
225 Py_ssize_t i;
226
227 mask = 0;
228 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000229 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230
231 return mask;
232}
233
234Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
235{
236 Py_ssize_t i;
237
238 for (i = 0; i < setlen; i++)
239 if (set[i] == chr)
240 return 1;
241
242 return 0;
243}
244
/* True if chr is a member of the setlen-character array set.  The bloom
   mask is consulted first so that most non-members are rejected without
   scanning the set.  The expansion is wrapped in parentheses so the
   macro behaves as a single operand (e.g. under ! or next to ||). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
247
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248/* --- Unicode Object ----------------------------------------------------- */
249
250static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000251int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000252 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253{
254 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000255
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000256 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260 /* Resizing shared object (unicode_empty or single character
261 objects) in-place is not allowed. Use PyUnicode_Resize()
262 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000263
Benjamin Peterson14339b62009-01-31 16:36:08 +0000264 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000265 (unicode->length == 1 &&
266 unicode->str[0] < 256U &&
267 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000269 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 return -1;
271 }
272
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273 /* We allocate one more byte to make sure the string is Ux0000 terminated.
274 The overallocation is also used by fastsearch, which assumes that it's
275 safe to look at str[length] (without making any assumptions about what
276 it contains). */
277
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000279 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000280 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000281 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000282 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000283 PyErr_NoMemory();
284 return -1;
285 }
286 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000291 if (unicode->defenc) {
292 Py_DECREF(unicode->defenc);
293 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294 }
295 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000296
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 return 0;
298}
299
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303
304 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000305 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306
307*/
308
309static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000310PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311{
312 register PyUnicodeObject *unicode;
313
Thomas Wouters477c8d52006-05-27 19:21:47 +0000314 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 if (length == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return unicode_empty;
318 }
319
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000320 /* Ensure we won't overflow the size. */
321 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
322 return (PyUnicodeObject *)PyErr_NoMemory();
323 }
324
Guido van Rossumd57fd912000-03-10 22:53:23 +0000325 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000326 if (free_list) {
327 unicode = free_list;
328 free_list = *(PyUnicodeObject **)unicode;
329 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000330 if (unicode->str) {
331 /* Keep-Alive optimization: we only upsize the buffer,
332 never downsize it. */
333 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000334 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 PyObject_DEL(unicode->str);
336 unicode->str = NULL;
337 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000338 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000339 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
341 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000342 }
343 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000344 }
345 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000346 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000347 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 if (unicode == NULL)
349 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000350 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
351 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000354 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 PyErr_NoMemory();
356 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000357 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000358 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000359 * the caller fails before initializing str -- unicode_resize()
360 * reads str[0], and the Keep-Alive optimization can keep memory
361 * allocated for str alive across a call to unicode_dealloc(unicode).
362 * We don't want unicode_resize to read uninitialized memory in
363 * that case.
364 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000368 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000369 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000370 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000374 /* XXX UNREF/NEWREF interface should be more symmetrical */
375 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000376 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000377 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379}
380
381static
Guido van Rossum9475a232001-10-05 20:51:39 +0000382void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000383{
Walter Dörwald16807132007-05-25 13:52:07 +0000384 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000385 case SSTATE_NOT_INTERNED:
386 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000387
Benjamin Peterson29060642009-01-31 22:14:21 +0000388 case SSTATE_INTERNED_MORTAL:
389 /* revive dead object temporarily for DelItem */
390 Py_REFCNT(unicode) = 3;
391 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
392 Py_FatalError(
393 "deletion of interned string failed");
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_IMMORTAL:
397 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000398
Benjamin Peterson29060642009-01-31 22:14:21 +0000399 default:
400 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000401 }
402
Guido van Rossum604ddf82001-12-06 20:03:56 +0000403 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000405 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
407 PyObject_DEL(unicode->str);
408 unicode->str = NULL;
409 unicode->length = 0;
410 }
411 if (unicode->defenc) {
412 Py_DECREF(unicode->defenc);
413 unicode->defenc = NULL;
414 }
415 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000416 *(PyUnicodeObject **)unicode = free_list;
417 free_list = unicode;
418 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000419 }
420 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 PyObject_DEL(unicode->str);
422 Py_XDECREF(unicode->defenc);
423 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000424 }
425}
426
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000427static
428int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429{
430 register PyUnicodeObject *v;
431
432 /* Argument checks */
433 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000434 PyErr_BadInternalCall();
435 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000437 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000438 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000439 PyErr_BadInternalCall();
440 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000441 }
442
443 /* Resizing unicode_empty and single character objects is not
444 possible since these are being shared. We simply return a fresh
445 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000446 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 (v == unicode_empty || v->length == 1)) {
448 PyUnicodeObject *w = _PyUnicode_New(length);
449 if (w == NULL)
450 return -1;
451 Py_UNICODE_COPY(w->str, v->str,
452 length < v->length ? length : v->length);
453 Py_DECREF(*unicode);
454 *unicode = w;
455 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000456 }
457
458 /* Note that we don't have to modify *unicode for unshared Unicode
459 objects, since we can modify them in-place. */
460 return unicode_resize(v, length);
461}
462
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000463int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
464{
465 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
466}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000467
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000469 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470{
471 PyUnicodeObject *unicode;
472
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000473 /* If the Unicode data is known at construction time, we can apply
474 some optimizations which share commonly used objects. */
475 if (u != NULL) {
476
Benjamin Peterson29060642009-01-31 22:14:21 +0000477 /* Optimization for empty strings */
478 if (size == 0 && unicode_empty != NULL) {
479 Py_INCREF(unicode_empty);
480 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000481 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000482
483 /* Single character Unicode objects in the Latin-1 range are
484 shared when using this constructor */
485 if (size == 1 && *u < 256) {
486 unicode = unicode_latin1[*u];
487 if (!unicode) {
488 unicode = _PyUnicode_New(1);
489 if (!unicode)
490 return NULL;
491 unicode->str[0] = *u;
492 unicode_latin1[*u] = unicode;
493 }
494 Py_INCREF(unicode);
495 return (PyObject *)unicode;
496 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000497 }
Tim Petersced69f82003-09-16 20:30:58 +0000498
Guido van Rossumd57fd912000-03-10 22:53:23 +0000499 unicode = _PyUnicode_New(size);
500 if (!unicode)
501 return NULL;
502
503 /* Copy the Unicode data into the new object */
504 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000505 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506
507 return (PyObject *)unicode;
508}
509
Walter Dörwaldd2034312007-05-18 16:29:38 +0000510PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000511{
512 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000513
Benjamin Peterson14339b62009-01-31 16:36:08 +0000514 if (size < 0) {
515 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000516 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000517 return NULL;
518 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000519
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000520 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000521 some optimizations which share commonly used objects.
522 Also, this means the input must be UTF-8, so fall back to the
523 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 if (u != NULL) {
525
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 /* Optimization for empty strings */
527 if (size == 0 && unicode_empty != NULL) {
528 Py_INCREF(unicode_empty);
529 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000530 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000531
532 /* Single characters are shared when using this constructor.
533 Restrict to ASCII, since the input must be UTF-8. */
534 if (size == 1 && Py_CHARMASK(*u) < 128) {
535 unicode = unicode_latin1[Py_CHARMASK(*u)];
536 if (!unicode) {
537 unicode = _PyUnicode_New(1);
538 if (!unicode)
539 return NULL;
540 unicode->str[0] = Py_CHARMASK(*u);
541 unicode_latin1[Py_CHARMASK(*u)] = unicode;
542 }
543 Py_INCREF(unicode);
544 return (PyObject *)unicode;
545 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000546
547 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000548 }
549
Walter Dörwald55507312007-05-18 13:12:10 +0000550 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000551 if (!unicode)
552 return NULL;
553
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000554 return (PyObject *)unicode;
555}
556
Walter Dörwaldd2034312007-05-18 16:29:38 +0000557PyObject *PyUnicode_FromString(const char *u)
558{
559 size_t size = strlen(u);
560 if (size > PY_SSIZE_T_MAX) {
561 PyErr_SetString(PyExc_OverflowError, "input too long");
562 return NULL;
563 }
564
565 return PyUnicode_FromStringAndSize(u, size);
566}
567
Guido van Rossumd57fd912000-03-10 22:53:23 +0000568#ifdef HAVE_WCHAR_H
569
Mark Dickinson081dfee2009-03-18 14:47:41 +0000570#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
571# define CONVERT_WCHAR_TO_SURROGATES
572#endif
573
574#ifdef CONVERT_WCHAR_TO_SURROGATES
575
576/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
577 to convert from UTF32 to UTF16. */
578
579PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
580 Py_ssize_t size)
581{
582 PyUnicodeObject *unicode;
583 register Py_ssize_t i;
584 Py_ssize_t alloc;
585 const wchar_t *orig_w;
586
587 if (w == NULL) {
588 if (size == 0)
589 return PyUnicode_FromStringAndSize(NULL, 0);
590 PyErr_BadInternalCall();
591 return NULL;
592 }
593
594 if (size == -1) {
595 size = wcslen(w);
596 }
597
598 alloc = size;
599 orig_w = w;
600 for (i = size; i > 0; i--) {
601 if (*w > 0xFFFF)
602 alloc++;
603 w++;
604 }
605 w = orig_w;
606 unicode = _PyUnicode_New(alloc);
607 if (!unicode)
608 return NULL;
609
610 /* Copy the wchar_t data into the new object */
611 {
612 register Py_UNICODE *u;
613 u = PyUnicode_AS_UNICODE(unicode);
614 for (i = size; i > 0; i--) {
615 if (*w > 0xFFFF) {
616 wchar_t ordinal = *w++;
617 ordinal -= 0x10000;
618 *u++ = 0xD800 | (ordinal >> 10);
619 *u++ = 0xDC00 | (ordinal & 0x3FF);
620 }
621 else
622 *u++ = *w++;
623 }
624 }
625 return (PyObject *)unicode;
626}
627
628#else
629
Guido van Rossumd57fd912000-03-10 22:53:23 +0000630PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000631 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632{
633 PyUnicodeObject *unicode;
634
635 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000636 if (size == 0)
637 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 PyErr_BadInternalCall();
639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640 }
641
Martin v. Löwis790465f2008-04-05 20:41:37 +0000642 if (size == -1) {
643 size = wcslen(w);
644 }
645
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646 unicode = _PyUnicode_New(size);
647 if (!unicode)
648 return NULL;
649
650 /* Copy the wchar_t data into the new object */
651#ifdef HAVE_USABLE_WCHAR_T
652 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000653#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000655 register Py_UNICODE *u;
656 register Py_ssize_t i;
657 u = PyUnicode_AS_UNICODE(unicode);
658 for (i = size; i > 0; i--)
659 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000660 }
661#endif
662
663 return (PyObject *)unicode;
664}
665
Mark Dickinson081dfee2009-03-18 14:47:41 +0000666#endif /* CONVERT_WCHAR_TO_SURROGATES */
667
668#undef CONVERT_WCHAR_TO_SURROGATES
669
Walter Dörwald346737f2007-05-31 10:44:43 +0000670static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000671makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
672 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000673{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000684 else if (longlongflag) {
685 /* longlongflag should only ever be nonzero on machines with
686 HAVE_LONG_LONG defined */
687#ifdef HAVE_LONG_LONG
688 char *f = PY_FORMAT_LONG_LONG;
689 while (*f)
690 *fmt++ = *f++;
691#else
692 /* we shouldn't ever get here */
693 assert(0);
694 *fmt++ = 'l';
695#endif
696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000697 else if (size_tflag) {
698 char *f = PY_FORMAT_SIZE_T;
699 while (*f)
700 *fmt++ = *f++;
701 }
702 *fmt++ = c;
703 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000704}
705
/* Append the NUL-terminated char string `string` to the output position `s`,
   one char per output unit, advancing `s`.  Relies on two locals of the
   expanding function: the output cursor `s` and a scratch `const char *copy`.
   Wrapped in do { } while (0) so that it behaves as a single statement
   (safe in unbraced if/else); every use must be followed by ';'. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy; ) \
            *s++ = *copy++; \
    } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
717
Walter Dörwaldd2034312007-05-18 16:29:38 +0000718PyObject *
719PyUnicode_FromFormatV(const char *format, va_list vargs)
720{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000721 va_list count;
722 Py_ssize_t callcount = 0;
723 PyObject **callresults = NULL;
724 PyObject **callresult = NULL;
725 Py_ssize_t n = 0;
726 int width = 0;
727 int precision = 0;
728 int zeropad;
729 const char* f;
730 Py_UNICODE *s;
731 PyObject *string;
732 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000733 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000734 /* use abuffer instead of buffer, if we need more space
735 * (which can happen if there's a format specifier with width). */
736 char *abuffer = NULL;
737 char *realbuffer;
738 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000739 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000740 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000741
742#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson14339b62009-01-31 16:36:08 +0000743 Py_MEMCPY(count, vargs, sizeof(va_list));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744#else
745#ifdef __va_copy
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 __va_copy(count, vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000748 count = vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000749#endif
750#endif
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000751 /* step 1: count the number of %S/%R/%A/%s format specifications
752 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
753 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
754 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000755 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000756 if (*f == '%') {
757 if (*(f+1)=='%')
758 continue;
759 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
760 ++callcount;
761 while (ISDIGIT((unsigned)*f))
762 width = (width*10) + *f++ - '0';
763 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
764 ;
765 if (*f == 's')
766 ++callcount;
767 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000768 }
769 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000770 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 if (callcount) {
772 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
773 if (!callresults) {
774 PyErr_NoMemory();
775 return NULL;
776 }
777 callresult = callresults;
778 }
779 /* step 3: figure out how large a buffer we need */
780 for (f = format; *f; f++) {
781 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000782#ifdef HAVE_LONG_LONG
783 int longlongflag = 0;
784#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000785 const char* p = f;
786 width = 0;
787 while (ISDIGIT((unsigned)*f))
788 width = (width*10) + *f++ - '0';
789 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
790 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000791
Benjamin Peterson14339b62009-01-31 16:36:08 +0000792 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
793 * they don't affect the amount of space we reserve.
794 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000795 if (*f == 'l') {
796 if (f[1] == 'd' || f[1] == 'u') {
797 ++f;
798 }
799#ifdef HAVE_LONG_LONG
800 else if (f[1] == 'l' &&
801 (f[2] == 'd' || f[2] == 'u')) {
802 longlongflag = 1;
803 f += 2;
804 }
805#endif
806 }
807 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000808 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000809 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000810
Benjamin Peterson14339b62009-01-31 16:36:08 +0000811 switch (*f) {
812 case 'c':
813 (void)va_arg(count, int);
814 /* fall through... */
815 case '%':
816 n++;
817 break;
818 case 'd': case 'u': case 'i': case 'x':
819 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000820#ifdef HAVE_LONG_LONG
821 if (longlongflag) {
822 if (width < MAX_LONG_LONG_CHARS)
823 width = MAX_LONG_LONG_CHARS;
824 }
825 else
826#endif
827 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
828 including sign. Decimal takes the most space. This
829 isn't enough for octal. If a width is specified we
830 need more (which we allocate later). */
831 if (width < MAX_LONG_CHARS)
832 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000833 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000835 if (abuffersize < width)
836 abuffersize = width;
837 break;
838 case 's':
839 {
840 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000841 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000842 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
843 if (!str)
844 goto fail;
845 n += PyUnicode_GET_SIZE(str);
846 /* Remember the str and switch to the next slot */
847 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000848 break;
849 }
850 case 'U':
851 {
852 PyObject *obj = va_arg(count, PyObject *);
853 assert(obj && PyUnicode_Check(obj));
854 n += PyUnicode_GET_SIZE(obj);
855 break;
856 }
857 case 'V':
858 {
859 PyObject *obj = va_arg(count, PyObject *);
860 const char *str = va_arg(count, const char *);
861 assert(obj || str);
862 assert(!obj || PyUnicode_Check(obj));
863 if (obj)
864 n += PyUnicode_GET_SIZE(obj);
865 else
866 n += strlen(str);
867 break;
868 }
869 case 'S':
870 {
871 PyObject *obj = va_arg(count, PyObject *);
872 PyObject *str;
873 assert(obj);
874 str = PyObject_Str(obj);
875 if (!str)
876 goto fail;
877 n += PyUnicode_GET_SIZE(str);
878 /* Remember the str and switch to the next slot */
879 *callresult++ = str;
880 break;
881 }
882 case 'R':
883 {
884 PyObject *obj = va_arg(count, PyObject *);
885 PyObject *repr;
886 assert(obj);
887 repr = PyObject_Repr(obj);
888 if (!repr)
889 goto fail;
890 n += PyUnicode_GET_SIZE(repr);
891 /* Remember the repr and switch to the next slot */
892 *callresult++ = repr;
893 break;
894 }
895 case 'A':
896 {
897 PyObject *obj = va_arg(count, PyObject *);
898 PyObject *ascii;
899 assert(obj);
900 ascii = PyObject_ASCII(obj);
901 if (!ascii)
902 goto fail;
903 n += PyUnicode_GET_SIZE(ascii);
904 /* Remember the repr and switch to the next slot */
905 *callresult++ = ascii;
906 break;
907 }
908 case 'p':
909 (void) va_arg(count, int);
910 /* maximum 64-bit pointer representation:
911 * 0xffffffffffffffff
912 * so 19 characters is enough.
913 * XXX I count 18 -- what's the extra for?
914 */
915 n += 19;
916 break;
917 default:
918 /* if we stumble upon an unknown
919 formatting code, copy the rest of
920 the format string to the output
921 string. (we cannot just skip the
922 code, since there's no way to know
923 what's in the argument list) */
924 n += strlen(p);
925 goto expand;
926 }
927 } else
928 n++;
929 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000930 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000931 if (abuffersize > ITEM_BUFFER_LEN) {
932 /* add 1 for sprintf's trailing null byte */
933 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000934 if (!abuffer) {
935 PyErr_NoMemory();
936 goto fail;
937 }
938 realbuffer = abuffer;
939 }
940 else
941 realbuffer = buffer;
942 /* step 4: fill the buffer */
943 /* Since we've analyzed how much space we need for the worst case,
944 we don't have to resize the string.
945 There can be no errors beyond this point. */
946 string = PyUnicode_FromUnicode(NULL, n);
947 if (!string)
948 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000949
Benjamin Peterson14339b62009-01-31 16:36:08 +0000950 s = PyUnicode_AS_UNICODE(string);
951 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000952
Benjamin Peterson14339b62009-01-31 16:36:08 +0000953 for (f = format; *f; f++) {
954 if (*f == '%') {
955 const char* p = f++;
956 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000957 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000958 int size_tflag = 0;
959 zeropad = (*f == '0');
960 /* parse the width.precision part */
961 width = 0;
962 while (ISDIGIT((unsigned)*f))
963 width = (width*10) + *f++ - '0';
964 precision = 0;
965 if (*f == '.') {
966 f++;
967 while (ISDIGIT((unsigned)*f))
968 precision = (precision*10) + *f++ - '0';
969 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000970 /* Handle %ld, %lu, %lld and %llu. */
971 if (*f == 'l') {
972 if (f[1] == 'd' || f[1] == 'u') {
973 longflag = 1;
974 ++f;
975 }
976#ifdef HAVE_LONG_LONG
977 else if (f[1] == 'l' &&
978 (f[2] == 'd' || f[2] == 'u')) {
979 longlongflag = 1;
980 f += 2;
981 }
982#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000983 }
984 /* handle the size_t flag. */
985 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
986 size_tflag = 1;
987 ++f;
988 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000989
Benjamin Peterson14339b62009-01-31 16:36:08 +0000990 switch (*f) {
991 case 'c':
992 *s++ = va_arg(vargs, int);
993 break;
994 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000995 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
996 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +0000997 if (longflag)
998 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999#ifdef HAVE_LONG_LONG
1000 else if (longlongflag)
1001 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1002#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001003 else if (size_tflag)
1004 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1005 else
1006 sprintf(realbuffer, fmt, va_arg(vargs, int));
1007 appendstring(realbuffer);
1008 break;
1009 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001010 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1011 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001012 if (longflag)
1013 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001014#ifdef HAVE_LONG_LONG
1015 else if (longlongflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs,
1017 unsigned PY_LONG_LONG));
1018#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001019 else if (size_tflag)
1020 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1021 else
1022 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1023 appendstring(realbuffer);
1024 break;
1025 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001026 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 sprintf(realbuffer, fmt, va_arg(vargs, int));
1028 appendstring(realbuffer);
1029 break;
1030 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001031 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001032 sprintf(realbuffer, fmt, va_arg(vargs, int));
1033 appendstring(realbuffer);
1034 break;
1035 case 's':
1036 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001037 /* unused, since we already have the result */
1038 (void) va_arg(vargs, char *);
1039 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1040 PyUnicode_GET_SIZE(*callresult));
1041 s += PyUnicode_GET_SIZE(*callresult);
1042 /* We're done with the unicode()/repr() => forget it */
1043 Py_DECREF(*callresult);
1044 /* switch to next unicode()/repr() result */
1045 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001046 break;
1047 }
1048 case 'U':
1049 {
1050 PyObject *obj = va_arg(vargs, PyObject *);
1051 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1052 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1053 s += size;
1054 break;
1055 }
1056 case 'V':
1057 {
1058 PyObject *obj = va_arg(vargs, PyObject *);
1059 const char *str = va_arg(vargs, const char *);
1060 if (obj) {
1061 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1062 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1063 s += size;
1064 } else {
1065 appendstring(str);
1066 }
1067 break;
1068 }
1069 case 'S':
1070 case 'R':
1071 {
1072 Py_UNICODE *ucopy;
1073 Py_ssize_t usize;
1074 Py_ssize_t upos;
1075 /* unused, since we already have the result */
1076 (void) va_arg(vargs, PyObject *);
1077 ucopy = PyUnicode_AS_UNICODE(*callresult);
1078 usize = PyUnicode_GET_SIZE(*callresult);
1079 for (upos = 0; upos<usize;)
1080 *s++ = ucopy[upos++];
1081 /* We're done with the unicode()/repr() => forget it */
1082 Py_DECREF(*callresult);
1083 /* switch to next unicode()/repr() result */
1084 ++callresult;
1085 break;
1086 }
1087 case 'p':
1088 sprintf(buffer, "%p", va_arg(vargs, void*));
1089 /* %p is ill-defined: ensure leading 0x. */
1090 if (buffer[1] == 'X')
1091 buffer[1] = 'x';
1092 else if (buffer[1] != 'x') {
1093 memmove(buffer+2, buffer, strlen(buffer)+1);
1094 buffer[0] = '0';
1095 buffer[1] = 'x';
1096 }
1097 appendstring(buffer);
1098 break;
1099 case '%':
1100 *s++ = '%';
1101 break;
1102 default:
1103 appendstring(p);
1104 goto end;
1105 }
1106 } else
1107 *s++ = *f;
1108 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001109
Benjamin Peterson29060642009-01-31 22:14:21 +00001110 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001111 if (callresults)
1112 PyObject_Free(callresults);
1113 if (abuffer)
1114 PyObject_Free(abuffer);
1115 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1116 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 if (callresults) {
1119 PyObject **callresult2 = callresults;
1120 while (callresult2 < callresult) {
1121 Py_DECREF(*callresult2);
1122 ++callresult2;
1123 }
1124 PyObject_Free(callresults);
1125 }
1126 if (abuffer)
1127 PyObject_Free(abuffer);
1128 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001129}
1130
1131#undef appendstring
1132
1133PyObject *
1134PyUnicode_FromFormat(const char *format, ...)
1135{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001136 PyObject* ret;
1137 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001138
1139#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001140 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001141#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001142 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001143#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 ret = PyUnicode_FromFormatV(format, vargs);
1145 va_end(vargs);
1146 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147}
1148
Martin v. Löwis18e16552006-02-15 17:27:45 +00001149Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001150 wchar_t *w,
1151 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152{
1153 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001154 PyErr_BadInternalCall();
1155 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001157
1158 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001160 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001161
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162#ifdef HAVE_USABLE_WCHAR_T
1163 memcpy(w, unicode->str, size * sizeof(wchar_t));
1164#else
1165 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 register Py_UNICODE *u;
1167 register Py_ssize_t i;
1168 u = PyUnicode_AS_UNICODE(unicode);
1169 for (i = size; i > 0; i--)
1170 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 }
1172#endif
1173
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001174 if (size > PyUnicode_GET_SIZE(unicode))
1175 return PyUnicode_GET_SIZE(unicode);
1176 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001177 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178}
1179
1180#endif
1181
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001182PyObject *PyUnicode_FromOrdinal(int ordinal)
1183{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001184 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001185
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001186 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001187 PyErr_SetString(PyExc_ValueError,
1188 "chr() arg not in range(0x110000)");
1189 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001190 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001191
1192#ifndef Py_UNICODE_WIDE
1193 if (ordinal > 0xffff) {
1194 ordinal -= 0x10000;
1195 s[0] = 0xD800 | (ordinal >> 10);
1196 s[1] = 0xDC00 | (ordinal & 0x3FF);
1197 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001198 }
1199#endif
1200
Hye-Shik Chang40574832004-04-06 07:24:51 +00001201 s[0] = (Py_UNICODE)ordinal;
1202 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001203}
1204
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205PyObject *PyUnicode_FromObject(register PyObject *obj)
1206{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001207 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001208 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001209 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001210 Py_INCREF(obj);
1211 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001212 }
1213 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001214 /* For a Unicode subtype that's not a Unicode object,
1215 return a true Unicode object with the same data. */
1216 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1217 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001218 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001219 PyErr_Format(PyExc_TypeError,
1220 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001221 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001222 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001223}
1224
1225PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 const char *encoding,
1227 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001228{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001229 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001230 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001231 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001232
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 PyErr_BadInternalCall();
1235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001237
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001238 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001239 PyErr_SetString(PyExc_TypeError,
1240 "decoding str is not supported");
1241 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001242 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001243
1244 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001245 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001246 s = PyBytes_AS_STRING(obj);
1247 len = PyBytes_GET_SIZE(obj);
1248 }
1249 else if (PyByteArray_Check(obj)) {
1250 s = PyByteArray_AS_STRING(obj);
1251 len = PyByteArray_GET_SIZE(obj);
1252 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001253 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001254 /* Overwrite the error message with something more useful in
1255 case of a TypeError. */
1256 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001257 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001258 "coercing to str: need string or buffer, "
1259 "%.80s found",
1260 Py_TYPE(obj)->tp_name);
1261 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001262 }
Tim Petersced69f82003-09-16 20:30:58 +00001263
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001264 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 if (len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001266 Py_INCREF(unicode_empty);
1267 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 }
Tim Petersced69f82003-09-16 20:30:58 +00001269 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001270 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001271
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001272 return v;
1273
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276}
1277
1278PyObject *PyUnicode_Decode(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001279 Py_ssize_t size,
1280 const char *encoding,
1281 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282{
1283 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001284 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001285 char lower[20]; /* Enough for any encoding name we recognize */
1286 char *l;
1287 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001288
1289 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001290 encoding = PyUnicode_GetDefaultEncoding();
1291
1292 /* Convert encoding to lower case and replace '_' with '-' in order to
1293 catch e.g. UTF_8 */
1294 e = encoding;
1295 l = lower;
1296 while (*e && l < &lower[(sizeof lower) - 2]) {
1297 if (ISUPPER(*e)) {
1298 *l++ = TOLOWER(*e++);
1299 }
1300 else if (*e == '_') {
1301 *l++ = '-';
1302 e++;
1303 }
1304 else {
1305 *l++ = *e++;
1306 }
1307 }
1308 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001309
1310 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001311 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001313 else if ((strcmp(lower, "latin-1") == 0) ||
1314 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001315 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001316#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001317 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001318 return PyUnicode_DecodeMBCS(s, size, errors);
1319#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001320 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001321 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001322 else if (strcmp(lower, "utf-16") == 0)
1323 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1324 else if (strcmp(lower, "utf-32") == 0)
1325 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326
1327 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001328 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001329 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001330 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001331 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332 if (buffer == NULL)
1333 goto onError;
1334 unicode = PyCodec_Decode(buffer, encoding, errors);
1335 if (unicode == NULL)
1336 goto onError;
1337 if (!PyUnicode_Check(unicode)) {
1338 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001339 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001340 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 Py_DECREF(unicode);
1342 goto onError;
1343 }
1344 Py_DECREF(buffer);
1345 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001346
Benjamin Peterson29060642009-01-31 22:14:21 +00001347 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 Py_XDECREF(buffer);
1349 return NULL;
1350}
1351
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001352PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1353 const char *encoding,
1354 const char *errors)
1355{
1356 PyObject *v;
1357
1358 if (!PyUnicode_Check(unicode)) {
1359 PyErr_BadArgument();
1360 goto onError;
1361 }
1362
1363 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001365
1366 /* Decode via the codec registry */
1367 v = PyCodec_Decode(unicode, encoding, errors);
1368 if (v == NULL)
1369 goto onError;
1370 return v;
1371
Benjamin Peterson29060642009-01-31 22:14:21 +00001372 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001373 return NULL;
1374}
1375
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001376PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1377 const char *encoding,
1378 const char *errors)
1379{
1380 PyObject *v;
1381
1382 if (!PyUnicode_Check(unicode)) {
1383 PyErr_BadArgument();
1384 goto onError;
1385 }
1386
1387 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001388 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001389
1390 /* Decode via the codec registry */
1391 v = PyCodec_Decode(unicode, encoding, errors);
1392 if (v == NULL)
1393 goto onError;
1394 if (!PyUnicode_Check(v)) {
1395 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001396 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001397 Py_TYPE(v)->tp_name);
1398 Py_DECREF(v);
1399 goto onError;
1400 }
1401 return v;
1402
Benjamin Peterson29060642009-01-31 22:14:21 +00001403 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001404 return NULL;
1405}
1406
Guido van Rossumd57fd912000-03-10 22:53:23 +00001407PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001408 Py_ssize_t size,
1409 const char *encoding,
1410 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411{
1412 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001413
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 unicode = PyUnicode_FromUnicode(s, size);
1415 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1418 Py_DECREF(unicode);
1419 return v;
1420}
1421
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001422PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1423 const char *encoding,
1424 const char *errors)
1425{
1426 PyObject *v;
1427
1428 if (!PyUnicode_Check(unicode)) {
1429 PyErr_BadArgument();
1430 goto onError;
1431 }
1432
1433 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001434 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001435
1436 /* Encode via the codec registry */
1437 v = PyCodec_Encode(unicode, encoding, errors);
1438 if (v == NULL)
1439 goto onError;
1440 return v;
1441
Benjamin Peterson29060642009-01-31 22:14:21 +00001442 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001443 return NULL;
1444}
1445
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1447 const char *encoding,
1448 const char *errors)
1449{
1450 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001451
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452 if (!PyUnicode_Check(unicode)) {
1453 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455 }
Fred Drakee4315f52000-05-09 19:53:39 +00001456
Tim Petersced69f82003-09-16 20:30:58 +00001457 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001458 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001459
1460 /* Shortcuts for common default encodings */
1461 if (errors == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001462 if (strcmp(encoding, "utf-8") == 0)
1463 return PyUnicode_AsUTF8String(unicode);
1464 else if (strcmp(encoding, "latin-1") == 0)
1465 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001466#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Peterson29060642009-01-31 22:14:21 +00001467 else if (strcmp(encoding, "mbcs") == 0)
1468 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001469#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001470 else if (strcmp(encoding, "ascii") == 0)
1471 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001472 /* During bootstrap, we may need to find the encodings
1473 package, to load the file system encoding, and require the
1474 file system encoding in order to load the encodings
1475 package.
1476
1477 Break out of this dependency by assuming that the path to
1478 the encodings module is ASCII-only. XXX could try wcstombs
1479 instead, if the file system encoding is the locale's
1480 encoding. */
1481 else if (Py_FileSystemDefaultEncoding &&
1482 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1483 !PyThreadState_GET()->interp->codecs_initialized)
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001485 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486
1487 /* Encode via the codec registry */
1488 v = PyCodec_Encode(unicode, encoding, errors);
1489 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001490 return NULL;
1491
1492 /* The normal path */
1493 if (PyBytes_Check(v))
1494 return v;
1495
1496 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001497 if (PyByteArray_Check(v)) {
1498 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001499 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001500 PyOS_snprintf(msg, sizeof(msg),
1501 "encoder %s returned buffer instead of bytes",
1502 encoding);
1503 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001504 Py_DECREF(v);
1505 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001506 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001507
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001508 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1509 Py_DECREF(v);
1510 return b;
1511 }
1512
1513 PyErr_Format(PyExc_TypeError,
1514 "encoder did not return a bytes object (type=%.400s)",
1515 Py_TYPE(v)->tp_name);
1516 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001517 return NULL;
1518}
1519
1520PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1521 const char *encoding,
1522 const char *errors)
1523{
1524 PyObject *v;
1525
1526 if (!PyUnicode_Check(unicode)) {
1527 PyErr_BadArgument();
1528 goto onError;
1529 }
1530
1531 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001532 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001533
1534 /* Encode via the codec registry */
1535 v = PyCodec_Encode(unicode, encoding, errors);
1536 if (v == NULL)
1537 goto onError;
1538 if (!PyUnicode_Check(v)) {
1539 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001540 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001541 Py_TYPE(v)->tp_name);
1542 Py_DECREF(v);
1543 goto onError;
1544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001546
Benjamin Peterson29060642009-01-31 22:14:21 +00001547 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return NULL;
1549}
1550
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001551PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001552 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001553{
1554 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001555 if (v)
1556 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001557 if (errors != NULL)
1558 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001559 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001560 PyUnicode_GET_SIZE(unicode),
1561 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001562 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001563 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001564 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001565 return v;
1566}
1567
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001568PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001569PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001570 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001571 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1572}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001573
Christian Heimes5894ba72007-11-04 11:43:14 +00001574PyObject*
1575PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1576{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001577 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1578 can be undefined. If it is case, decode using UTF-8. The following assumes
1579 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1580 bootstrapping process where the codecs aren't ready yet.
1581 */
1582 if (Py_FileSystemDefaultEncoding) {
1583#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001584 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001585 return PyUnicode_DecodeMBCS(s, size, "replace");
1586 }
1587#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001588 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001589 return PyUnicode_DecodeUTF8(s, size, "replace");
1590 }
1591#endif
1592 return PyUnicode_Decode(s, size,
1593 Py_FileSystemDefaultEncoding,
1594 "replace");
1595 }
1596 else {
1597 return PyUnicode_DecodeUTF8(s, size, "replace");
1598 }
1599}
1600
Martin v. Löwis011e8422009-05-05 04:43:17 +00001601/* Convert the argument to a bytes object, according to the file
1602 system encoding */
1603
1604int
1605PyUnicode_FSConverter(PyObject* arg, void* addr)
1606{
1607 PyObject *output = NULL;
1608 Py_ssize_t size;
1609 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001610 if (arg == NULL) {
1611 Py_DECREF(*(PyObject**)addr);
1612 return 1;
1613 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00001614 if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
1615 output = arg;
1616 Py_INCREF(output);
1617 }
1618 else {
1619 arg = PyUnicode_FromObject(arg);
1620 if (!arg)
1621 return 0;
1622 output = PyUnicode_AsEncodedObject(arg,
1623 Py_FileSystemDefaultEncoding,
Martin v. Löwis43c57782009-05-10 08:15:24 +00001624 "surrogateescape");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001625 Py_DECREF(arg);
1626 if (!output)
1627 return 0;
1628 if (!PyBytes_Check(output)) {
1629 Py_DECREF(output);
1630 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1631 return 0;
1632 }
1633 }
1634 if (PyBytes_Check(output)) {
1635 size = PyBytes_GET_SIZE(output);
1636 data = PyBytes_AS_STRING(output);
1637 }
1638 else {
1639 size = PyByteArray_GET_SIZE(output);
1640 data = PyByteArray_AS_STRING(output);
1641 }
1642 if (size != strlen(data)) {
1643 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1644 Py_DECREF(output);
1645 return 0;
1646 }
1647 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001648 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001649}
1650
1651
Martin v. Löwis5b222132007-06-10 09:51:05 +00001652char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001653_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001654{
Christian Heimesf3863112007-11-22 07:46:41 +00001655 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001656 if (!PyUnicode_Check(unicode)) {
1657 PyErr_BadArgument();
1658 return NULL;
1659 }
Christian Heimesf3863112007-11-22 07:46:41 +00001660 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1661 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001662 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001663 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001664 *psize = PyBytes_GET_SIZE(bytes);
1665 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001666}
1667
1668char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001669_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001670{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001671 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001672}
1673
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1675{
1676 if (!PyUnicode_Check(unicode)) {
1677 PyErr_BadArgument();
1678 goto onError;
1679 }
1680 return PyUnicode_AS_UNICODE(unicode);
1681
Benjamin Peterson29060642009-01-31 22:14:21 +00001682 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683 return NULL;
1684}
1685
Martin v. Löwis18e16552006-02-15 17:27:45 +00001686Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687{
1688 if (!PyUnicode_Check(unicode)) {
1689 PyErr_BadArgument();
1690 goto onError;
1691 }
1692 return PyUnicode_GET_SIZE(unicode);
1693
Benjamin Peterson29060642009-01-31 22:14:21 +00001694 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695 return -1;
1696}
1697
Thomas Wouters78890102000-07-22 19:25:51 +00001698const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001699{
1700 return unicode_default_encoding;
1701}
1702
1703int PyUnicode_SetDefaultEncoding(const char *encoding)
1704{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001705 if (strcmp(encoding, unicode_default_encoding) != 0) {
1706 PyErr_Format(PyExc_ValueError,
1707 "Can only set default encoding to %s",
1708 unicode_default_encoding);
1709 return -1;
1710 }
Fred Drakee4315f52000-05-09 19:53:39 +00001711 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001712}
1713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714/* error handling callback helper:
1715 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001716 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001717 and adjust various state variables.
1718 return 0 on success, -1 on error
1719*/
1720
1721static
1722int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 const char *encoding, const char *reason,
1724 const char **input, const char **inend, Py_ssize_t *startinpos,
1725 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1726 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001727{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001728 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729
1730 PyObject *restuple = NULL;
1731 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001732 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001733 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001734 Py_ssize_t requiredsize;
1735 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001736 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001737 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001738 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 int res = -1;
1740
1741 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001742 *errorHandler = PyCodec_LookupError(errors);
1743 if (*errorHandler == NULL)
1744 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001745 }
1746
1747 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00001748 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00001749 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1750 if (*exceptionObject == NULL)
1751 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 }
1753 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00001754 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1755 goto onError;
1756 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1757 goto onError;
1758 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1759 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 }
1761
1762 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1763 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001765 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001766 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001768 }
1769 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001770 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001771
1772 /* Copy back the bytes variables, which might have been modified by the
1773 callback */
1774 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1775 if (!inputobj)
1776 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001777 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001779 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001780 *input = PyBytes_AS_STRING(inputobj);
1781 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001782 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001783 /* we can DECREF safely, as the exception has another reference,
1784 so the object won't go away. */
1785 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001786
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001787 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001788 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001789 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001790 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1791 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001792 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001793
1794 /* need more space? (at least enough for what we
1795 have+the replacement+the rest of the string (starting
1796 at the new input position), so we won't have to check space
1797 when there are no errors in the rest of the string) */
1798 repptr = PyUnicode_AS_UNICODE(repunicode);
1799 repsize = PyUnicode_GET_SIZE(repunicode);
1800 requiredsize = *outpos + repsize + insize-newpos;
1801 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001802 if (requiredsize<2*outsize)
1803 requiredsize = 2*outsize;
1804 if (_PyUnicode_Resize(output, requiredsize) < 0)
1805 goto onError;
1806 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001807 }
1808 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001809 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001810 Py_UNICODE_COPY(*outptr, repptr, repsize);
1811 *outptr += repsize;
1812 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001814 /* we made it! */
1815 res = 0;
1816
Benjamin Peterson29060642009-01-31 22:14:21 +00001817 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818 Py_XDECREF(restuple);
1819 return res;
1820}
1821
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001822/* --- UTF-7 Codec -------------------------------------------------------- */
1823
Antoine Pitrou244651a2009-05-04 18:56:13 +00001824/* See RFC2152 for details. We encode conservatively and decode liberally. */
1825
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Each macro may evaluate its argument more than once. */

/* True if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* The 6-bit value of base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; any other character yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* The base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c, found outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
1856
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  utf7_category[] is indexed by the ASCII
 * code (0..127) and holds one of:
 *  0 : "Set D"
 *     alphanumeric and '(),-./:?
 *  1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 *  2 : "whitespace"
 *     ht nl cr sp
 *  3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
1890
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character to test; only 1..127 can ever be direct.
 * directO  - non-zero if Set O characters (category 1) are written as-is.
 * directWS - non-zero if whitespace (category 2) is written as-is.
 *
 * The flag arguments are parenthesized so that compound expressions
 * (e.g. `a || b`) keep their meaning inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001902
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 Py_ssize_t size,
1905 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001906{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001907 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1908}
1909
Antoine Pitrou244651a2009-05-04 18:56:13 +00001910/* The decoder. The only state we preserve is our read position,
1911 * i.e. how many characters we have consumed. So if we end in the
1912 * middle of a shift sequence we have to back off the read position
1913 * and the output to the beginning of the sequence, otherwise we lose
1914 * all the shift state (seen bits, number of bits seen, high
1915 * surrogate). */
1916
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001917PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001918 Py_ssize_t size,
1919 const char *errors,
1920 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001921{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001922 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001923 Py_ssize_t startinpos;
1924 Py_ssize_t endinpos;
1925 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001926 const char *e;
1927 PyUnicodeObject *unicode;
1928 Py_UNICODE *p;
1929 const char *errmsg = "";
1930 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001931 Py_UNICODE *shiftOutStart;
1932 unsigned int base64bits = 0;
1933 unsigned long base64buffer = 0;
1934 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001935 PyObject *errorHandler = NULL;
1936 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001937
1938 unicode = _PyUnicode_New(size);
1939 if (!unicode)
1940 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001941 if (size == 0) {
1942 if (consumed)
1943 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001944 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001945 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001946
1947 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00001948 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001949 e = s + size;
1950
1951 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001952 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00001953 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001954 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001955
Antoine Pitrou244651a2009-05-04 18:56:13 +00001956 if (inShift) { /* in a base-64 section */
1957 if (IS_BASE64(ch)) { /* consume a base-64 character */
1958 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1959 base64bits += 6;
1960 s++;
1961 if (base64bits >= 16) {
1962 /* we have enough bits for a UTF-16 value */
1963 Py_UNICODE outCh = (Py_UNICODE)
1964 (base64buffer >> (base64bits-16));
1965 base64bits -= 16;
1966 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1967 if (surrogate) {
1968 /* expecting a second surrogate */
1969 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1970#ifdef Py_UNICODE_WIDE
1971 *p++ = (((surrogate & 0x3FF)<<10)
1972 | (outCh & 0x3FF)) + 0x10000;
1973#else
1974 *p++ = surrogate;
1975 *p++ = outCh;
1976#endif
1977 surrogate = 0;
1978 }
1979 else {
1980 surrogate = 0;
1981 errmsg = "second surrogate missing";
1982 goto utf7Error;
1983 }
1984 }
1985 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1986 /* first surrogate */
1987 surrogate = outCh;
1988 }
1989 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1990 errmsg = "unexpected second surrogate";
1991 goto utf7Error;
1992 }
1993 else {
1994 *p++ = outCh;
1995 }
1996 }
1997 }
1998 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001999 inShift = 0;
2000 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002001 if (surrogate) {
2002 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002003 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002004 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002005 if (base64bits > 0) { /* left-over bits */
2006 if (base64bits >= 6) {
2007 /* We've seen at least one base-64 character */
2008 errmsg = "partial character in shift sequence";
2009 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002010 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002011 else {
2012 /* Some bits remain; they should be zero */
2013 if (base64buffer != 0) {
2014 errmsg = "non-zero padding bits in shift sequence";
2015 goto utf7Error;
2016 }
2017 }
2018 }
2019 if (ch != '-') {
2020 /* '-' is absorbed; other terminating
2021 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002022 *p++ = ch;
2023 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002024 }
2025 }
2026 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002027 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002028 s++; /* consume '+' */
2029 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002030 s++;
2031 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002032 }
2033 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002034 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002035 shiftOutStart = p;
2036 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002037 }
2038 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002039 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002040 *p++ = ch;
2041 s++;
2042 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002043 else {
2044 startinpos = s-starts;
2045 s++;
2046 errmsg = "unexpected special character";
2047 goto utf7Error;
2048 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002049 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002050utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002051 outpos = p-PyUnicode_AS_UNICODE(unicode);
2052 endinpos = s-starts;
2053 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 errors, &errorHandler,
2055 "utf7", errmsg,
2056 &starts, &e, &startinpos, &endinpos, &exc, &s,
2057 &unicode, &outpos, &p))
2058 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002059 }
2060
Antoine Pitrou244651a2009-05-04 18:56:13 +00002061 /* end of string */
2062
2063 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2064 /* if we're in an inconsistent state, that's an error */
2065 if (surrogate ||
2066 (base64bits >= 6) ||
2067 (base64bits > 0 && base64buffer != 0)) {
2068 outpos = p-PyUnicode_AS_UNICODE(unicode);
2069 endinpos = size;
2070 if (unicode_decode_call_errorhandler(
2071 errors, &errorHandler,
2072 "utf7", "unterminated shift sequence",
2073 &starts, &e, &startinpos, &endinpos, &exc, &s,
2074 &unicode, &outpos, &p))
2075 goto onError;
2076 if (s < e)
2077 goto restart;
2078 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002079 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002080
2081 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002082 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002083 if (inShift) {
2084 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002085 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002086 }
2087 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002088 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002090 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002091
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002092 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002093 goto onError;
2094
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002095 Py_XDECREF(errorHandler);
2096 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002097 return (PyObject *)unicode;
2098
Benjamin Peterson29060642009-01-31 22:14:21 +00002099 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002100 Py_XDECREF(errorHandler);
2101 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002102 Py_DECREF(unicode);
2103 return NULL;
2104}
2105
2106
2107PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002108 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002109 int base64SetO,
2110 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002111 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002112{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002113 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002114 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002115 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002116 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002117 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002118 unsigned int base64bits = 0;
2119 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002120 char * out;
2121 char * start;
2122
2123 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002124 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002125
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002126 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002127 return PyErr_NoMemory();
2128
Antoine Pitrou244651a2009-05-04 18:56:13 +00002129 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002130 if (v == NULL)
2131 return NULL;
2132
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002133 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002134 for (;i < size; ++i) {
2135 Py_UNICODE ch = s[i];
2136
Antoine Pitrou244651a2009-05-04 18:56:13 +00002137 if (inShift) {
2138 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2139 /* shifting out */
2140 if (base64bits) { /* output remaining bits */
2141 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2142 base64buffer = 0;
2143 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002144 }
2145 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002146 /* Characters not in the BASE64 set implicitly unshift the sequence
2147 so no '-' is required, except if the character is itself a '-' */
2148 if (IS_BASE64(ch) || ch == '-') {
2149 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002150 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002151 *out++ = (char) ch;
2152 }
2153 else {
2154 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002155 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002156 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002157 else { /* not in a shift sequence */
2158 if (ch == '+') {
2159 *out++ = '+';
2160 *out++ = '-';
2161 }
2162 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2163 *out++ = (char) ch;
2164 }
2165 else {
2166 *out++ = '+';
2167 inShift = 1;
2168 goto encode_char;
2169 }
2170 }
2171 continue;
2172encode_char:
2173#ifdef Py_UNICODE_WIDE
2174 if (ch >= 0x10000) {
2175 /* code first surrogate */
2176 base64bits += 16;
2177 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2178 while (base64bits >= 6) {
2179 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2180 base64bits -= 6;
2181 }
2182 /* prepare second surrogate */
2183 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2184 }
2185#endif
2186 base64bits += 16;
2187 base64buffer = (base64buffer << 16) | ch;
2188 while (base64bits >= 6) {
2189 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2190 base64bits -= 6;
2191 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002192 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002193 if (base64bits)
2194 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2195 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002196 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002197 if (_PyBytes_Resize(&v, out - start) < 0)
2198 return NULL;
2199 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002200}
2201
Antoine Pitrou244651a2009-05-04 18:56:13 +00002202#undef IS_BASE64
2203#undef FROM_BASE64
2204#undef TO_BASE64
2205#undef DECODE_DIRECT
2206#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002207
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208/* --- UTF-8 Codec -------------------------------------------------------- */
2209
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four-, five- and six-byte prefixes; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
2231
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002233 Py_ssize_t size,
2234 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235{
Walter Dörwald69652032004-09-07 20:24:22 +00002236 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2237}
2238
Antoine Pitrouab868312009-01-10 15:40:25 +00002239/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2240#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2241
2242/* Mask to quickly check whether a C 'long' contains a
2243 non-ASCII, UTF8-encoded char. */
2244#if (SIZEOF_LONG == 8)
2245# define ASCII_CHAR_MASK 0x8080808080808080L
2246#elif (SIZEOF_LONG == 4)
2247# define ASCII_CHAR_MASK 0x80808080L
2248#else
2249# error C 'long' size should be either 4 or 8!
2250#endif
2251
Walter Dörwald69652032004-09-07 20:24:22 +00002252PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002253 Py_ssize_t size,
2254 const char *errors,
2255 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002256{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002259 Py_ssize_t startinpos;
2260 Py_ssize_t endinpos;
2261 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002262 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 PyUnicodeObject *unicode;
2264 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002265 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002266 PyObject *errorHandler = NULL;
2267 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268
2269 /* Note: size will always be longer than the resulting Unicode
2270 character count */
2271 unicode = _PyUnicode_New(size);
2272 if (!unicode)
2273 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002274 if (size == 0) {
2275 if (consumed)
2276 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002277 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279
2280 /* Unpack UTF-8 encoded data */
2281 p = unicode->str;
2282 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002283 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284
2285 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002286 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287
2288 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002289 /* Fast path for runs of ASCII characters. Given that common UTF-8
2290 input will consist of an overwhelming majority of ASCII
2291 characters, we try to optimize for this case by checking
2292 as many characters as a C 'long' can contain.
2293 First, check if we can do an aligned read, as most CPUs have
2294 a penalty for unaligned reads.
2295 */
2296 if (!((size_t) s & LONG_PTR_MASK)) {
2297 /* Help register allocation */
2298 register const char *_s = s;
2299 register Py_UNICODE *_p = p;
2300 while (_s < aligned_end) {
2301 /* Read a whole long at a time (either 4 or 8 bytes),
2302 and do a fast unrolled copy if it only contains ASCII
2303 characters. */
2304 unsigned long data = *(unsigned long *) _s;
2305 if (data & ASCII_CHAR_MASK)
2306 break;
2307 _p[0] = (unsigned char) _s[0];
2308 _p[1] = (unsigned char) _s[1];
2309 _p[2] = (unsigned char) _s[2];
2310 _p[3] = (unsigned char) _s[3];
2311#if (SIZEOF_LONG == 8)
2312 _p[4] = (unsigned char) _s[4];
2313 _p[5] = (unsigned char) _s[5];
2314 _p[6] = (unsigned char) _s[6];
2315 _p[7] = (unsigned char) _s[7];
2316#endif
2317 _s += SIZEOF_LONG;
2318 _p += SIZEOF_LONG;
2319 }
2320 s = _s;
2321 p = _p;
2322 if (s == e)
2323 break;
2324 ch = (unsigned char)*s;
2325 }
2326 }
2327
2328 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002329 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 s++;
2331 continue;
2332 }
2333
2334 n = utf8_code_length[ch];
2335
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002336 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002337 if (consumed)
2338 break;
2339 else {
2340 errmsg = "unexpected end of data";
2341 startinpos = s-starts;
2342 endinpos = size;
2343 goto utf8Error;
2344 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346
2347 switch (n) {
2348
2349 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002350 errmsg = "unexpected code byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002351 startinpos = s-starts;
2352 endinpos = startinpos+1;
2353 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002354
2355 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002356 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002357 startinpos = s-starts;
2358 endinpos = startinpos+1;
2359 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360
2361 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002362 if ((s[1] & 0xc0) != 0x80) {
2363 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002364 startinpos = s-starts;
2365 endinpos = startinpos+2;
2366 goto utf8Error;
2367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002369 if (ch < 0x80) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002370 startinpos = s-starts;
2371 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002372 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002373 goto utf8Error;
2374 }
2375 else
2376 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377 break;
2378
2379 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002380 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002381 (s[2] & 0xc0) != 0x80) {
2382 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002383 startinpos = s-starts;
2384 endinpos = startinpos+3;
2385 goto utf8Error;
2386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002388 if (ch < 0x0800 || (ch >= 0xd800 && ch <= 0xDFFF)) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002389 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002390 startinpos = s-starts;
2391 endinpos = startinpos+3;
2392 goto utf8Error;
2393 }
2394 else
2395 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002396 break;
2397
2398 case 4:
2399 if ((s[1] & 0xc0) != 0x80 ||
2400 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002401 (s[3] & 0xc0) != 0x80) {
2402 errmsg = "invalid data";
Benjamin Peterson29060642009-01-31 22:14:21 +00002403 startinpos = s-starts;
2404 endinpos = startinpos+4;
2405 goto utf8Error;
2406 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002407 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Peterson29060642009-01-31 22:14:21 +00002408 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002409 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002410 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Peterson29060642009-01-31 22:14:21 +00002411 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002412 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Peterson29060642009-01-31 22:14:21 +00002413 UTF-16 */
2414 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002415 errmsg = "illegal encoding";
Benjamin Peterson29060642009-01-31 22:14:21 +00002416 startinpos = s-starts;
2417 endinpos = startinpos+4;
2418 goto utf8Error;
2419 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002420#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002422#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002423 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002424
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002425 /* translate from 10000..10FFFF to 0..FFFF */
2426 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002427
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002428 /* high surrogate = top 10 bits added to D800 */
2429 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002430
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002431 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002432 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002433#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434 break;
2435
2436 default:
2437 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002438 errmsg = "unsupported Unicode code range";
Benjamin Peterson29060642009-01-31 22:14:21 +00002439 startinpos = s-starts;
2440 endinpos = startinpos+n;
2441 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 }
2443 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002444 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002445
Benjamin Peterson29060642009-01-31 22:14:21 +00002446 utf8Error:
2447 outpos = p-PyUnicode_AS_UNICODE(unicode);
2448 if (unicode_decode_call_errorhandler(
2449 errors, &errorHandler,
2450 "utf8", errmsg,
2451 &starts, &e, &startinpos, &endinpos, &exc, &s,
2452 &unicode, &outpos, &p))
2453 goto onError;
2454 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455 }
Walter Dörwald69652032004-09-07 20:24:22 +00002456 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002457 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458
2459 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002460 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 goto onError;
2462
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002463 Py_XDECREF(errorHandler);
2464 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 return (PyObject *)unicode;
2466
Benjamin Peterson29060642009-01-31 22:14:21 +00002467 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002468 Py_XDECREF(errorHandler);
2469 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470 Py_DECREF(unicode);
2471 return NULL;
2472}
2473
Antoine Pitrouab868312009-01-10 15:40:25 +00002474#undef ASCII_CHAR_MASK
2475
2476
Tim Peters602f7402002-04-27 18:03:26 +00002477/* Allocation strategy: if the string is short, convert into a stack buffer
2478 and allocate exactly as much space needed at the end. Else allocate the
2479 maximum possible needed (4 result bytes per Unicode character), and return
2480 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002481*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002482PyObject *
2483PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002484 Py_ssize_t size,
2485 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486{
Tim Peters602f7402002-04-27 18:03:26 +00002487#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002488
Guido van Rossum98297ee2007-11-06 21:34:58 +00002489 Py_ssize_t i; /* index into s of next input byte */
2490 PyObject *result; /* result string object */
2491 char *p; /* next free byte in output buffer */
2492 Py_ssize_t nallocated; /* number of result bytes allocated */
2493 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002494 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002495 PyObject *errorHandler = NULL;
2496 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002497
Tim Peters602f7402002-04-27 18:03:26 +00002498 assert(s != NULL);
2499 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500
Tim Peters602f7402002-04-27 18:03:26 +00002501 if (size <= MAX_SHORT_UNICHARS) {
2502 /* Write into the stack buffer; nallocated can't overflow.
2503 * At the end, we'll allocate exactly as much heap space as it
2504 * turns out we need.
2505 */
2506 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002507 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002508 p = stackbuf;
2509 }
2510 else {
2511 /* Overallocate on the heap, and give the excess back at the end. */
2512 nallocated = size * 4;
2513 if (nallocated / 4 != size) /* overflow! */
2514 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002515 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002516 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002517 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002518 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002519 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002520
Tim Peters602f7402002-04-27 18:03:26 +00002521 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002522 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002523
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002524 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002525 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002527
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002529 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002530 *p++ = (char)(0xc0 | (ch >> 6));
2531 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002532 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002533 else {
Tim Peters602f7402002-04-27 18:03:26 +00002534 /* Encode UCS2 Unicode ordinals */
2535 if (ch < 0x10000) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002536#ifndef Py_UNICODE_WIDE
Tim Peters602f7402002-04-27 18:03:26 +00002537 /* Special case: check for high surrogate */
2538 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2539 Py_UCS4 ch2 = s[i];
2540 /* Check for low surrogate and combine the two to
2541 form a UCS4 value */
2542 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002543 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002544 i++;
2545 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002546 }
Tim Peters602f7402002-04-27 18:03:26 +00002547 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002548 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002549#endif
2550 if (ch >= 0xd800 && ch <= 0xdfff) {
2551 Py_ssize_t newpos;
2552 PyObject *rep;
2553 char *prep;
2554 int k;
2555 rep = unicode_encode_call_errorhandler
2556 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2557 s, size, &exc, i-1, i, &newpos);
2558 if (!rep)
2559 goto error;
2560 /* Implementation limitations: only support error handler that return
2561 bytes, and only support up to four replacement bytes. */
2562 if (!PyBytes_Check(rep)) {
2563 PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2564 Py_DECREF(rep);
2565 goto error;
2566 }
2567 if (PyBytes_Size(rep) > 4) {
2568 PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2569 Py_DECREF(rep);
2570 goto error;
2571 }
2572 prep = PyBytes_AsString(rep);
2573 for(k = PyBytes_Size(rep); k > 0; k--)
2574 *p++ = *prep++;
2575 Py_DECREF(rep);
2576 continue;
2577
2578 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002579 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002580 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2581 *p++ = (char)(0x80 | (ch & 0x3f));
2582 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 }
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002584#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002585 encodeUCS4:
Benjamin Petersonadf6a6c2009-11-10 21:23:15 +00002586#endif
Tim Peters602f7402002-04-27 18:03:26 +00002587 /* Encode UCS4 Unicode ordinals */
2588 *p++ = (char)(0xf0 | (ch >> 18));
2589 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2590 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2591 *p++ = (char)(0x80 | (ch & 0x3f));
2592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002594
Guido van Rossum98297ee2007-11-06 21:34:58 +00002595 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002596 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002597 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002598 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002599 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002600 }
2601 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002602 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002603 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002604 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002605 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002606 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002607 Py_XDECREF(errorHandler);
2608 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002609 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002610 error:
2611 Py_XDECREF(errorHandler);
2612 Py_XDECREF(exc);
2613 Py_XDECREF(result);
2614 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002615
Tim Peters602f7402002-04-27 18:03:26 +00002616#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617}
2618
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2620{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 if (!PyUnicode_Check(unicode)) {
2622 PyErr_BadArgument();
2623 return NULL;
2624 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002625 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002626 PyUnicode_GET_SIZE(unicode),
2627 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628}
2629
/* --- UTF-32 Codec ------------------------------------------------------- */
2631
2632PyObject *
2633PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002634 Py_ssize_t size,
2635 const char *errors,
2636 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002637{
2638 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2639}
2640
2641PyObject *
2642PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002643 Py_ssize_t size,
2644 const char *errors,
2645 int *byteorder,
2646 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002647{
2648 const char *starts = s;
2649 Py_ssize_t startinpos;
2650 Py_ssize_t endinpos;
2651 Py_ssize_t outpos;
2652 PyUnicodeObject *unicode;
2653 Py_UNICODE *p;
2654#ifndef Py_UNICODE_WIDE
2655 int i, pairs;
2656#else
2657 const int pairs = 0;
2658#endif
2659 const unsigned char *q, *e;
2660 int bo = 0; /* assume native ordering by default */
2661 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002662 /* Offsets from q for retrieving bytes in the right order. */
2663#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2664 int iorder[] = {0, 1, 2, 3};
2665#else
2666 int iorder[] = {3, 2, 1, 0};
2667#endif
2668 PyObject *errorHandler = NULL;
2669 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002670 /* On narrow builds we split characters outside the BMP into two
2671 codepoints => count how much extra space we need. */
2672#ifndef Py_UNICODE_WIDE
2673 for (i = pairs = 0; i < size/4; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002674 if (((Py_UCS4 *)s)[i] >= 0x10000)
2675 pairs++;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002676#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002677
2678 /* This might be one to much, because of a BOM */
2679 unicode = _PyUnicode_New((size+3)/4+pairs);
2680 if (!unicode)
2681 return NULL;
2682 if (size == 0)
2683 return (PyObject *)unicode;
2684
2685 /* Unpack UTF-32 encoded data */
2686 p = unicode->str;
2687 q = (unsigned char *)s;
2688 e = q + size;
2689
2690 if (byteorder)
2691 bo = *byteorder;
2692
2693 /* Check for BOM marks (U+FEFF) in the input and adjust current
2694 byte order setting accordingly. In native mode, the leading BOM
2695 mark is skipped, in all other modes, it is copied to the output
2696 stream as-is (giving a ZWNBSP character). */
2697 if (bo == 0) {
2698 if (size >= 4) {
2699 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002701#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002702 if (bom == 0x0000FEFF) {
2703 q += 4;
2704 bo = -1;
2705 }
2706 else if (bom == 0xFFFE0000) {
2707 q += 4;
2708 bo = 1;
2709 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002710#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 if (bom == 0x0000FEFF) {
2712 q += 4;
2713 bo = 1;
2714 }
2715 else if (bom == 0xFFFE0000) {
2716 q += 4;
2717 bo = -1;
2718 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002719#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002720 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002721 }
2722
2723 if (bo == -1) {
2724 /* force LE */
2725 iorder[0] = 0;
2726 iorder[1] = 1;
2727 iorder[2] = 2;
2728 iorder[3] = 3;
2729 }
2730 else if (bo == 1) {
2731 /* force BE */
2732 iorder[0] = 3;
2733 iorder[1] = 2;
2734 iorder[2] = 1;
2735 iorder[3] = 0;
2736 }
2737
2738 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002739 Py_UCS4 ch;
2740 /* remaining bytes at the end? (size should be divisible by 4) */
2741 if (e-q<4) {
2742 if (consumed)
2743 break;
2744 errmsg = "truncated data";
2745 startinpos = ((const char *)q)-starts;
2746 endinpos = ((const char *)e)-starts;
2747 goto utf32Error;
2748 /* The remaining input chars are ignored if the callback
2749 chooses to skip the input */
2750 }
2751 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2752 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002753
Benjamin Peterson29060642009-01-31 22:14:21 +00002754 if (ch >= 0x110000)
2755 {
2756 errmsg = "codepoint not in range(0x110000)";
2757 startinpos = ((const char *)q)-starts;
2758 endinpos = startinpos+4;
2759 goto utf32Error;
2760 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002761#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 if (ch >= 0x10000)
2763 {
2764 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2765 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2766 }
2767 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002768#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 *p++ = ch;
2770 q += 4;
2771 continue;
2772 utf32Error:
2773 outpos = p-PyUnicode_AS_UNICODE(unicode);
2774 if (unicode_decode_call_errorhandler(
2775 errors, &errorHandler,
2776 "utf32", errmsg,
2777 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2778 &unicode, &outpos, &p))
2779 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002780 }
2781
2782 if (byteorder)
2783 *byteorder = bo;
2784
2785 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002787
2788 /* Adjust length */
2789 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2790 goto onError;
2791
2792 Py_XDECREF(errorHandler);
2793 Py_XDECREF(exc);
2794 return (PyObject *)unicode;
2795
Benjamin Peterson29060642009-01-31 22:14:21 +00002796 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002797 Py_DECREF(unicode);
2798 Py_XDECREF(errorHandler);
2799 Py_XDECREF(exc);
2800 return NULL;
2801}
2802
2803PyObject *
2804PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 Py_ssize_t size,
2806 const char *errors,
2807 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002808{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002809 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002810 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002811 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002812#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002813 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002814#else
2815 const int pairs = 0;
2816#endif
2817 /* Offsets from p for storing byte pairs in the right order. */
2818#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2819 int iorder[] = {0, 1, 2, 3};
2820#else
2821 int iorder[] = {3, 2, 1, 0};
2822#endif
2823
Benjamin Peterson29060642009-01-31 22:14:21 +00002824#define STORECHAR(CH) \
2825 do { \
2826 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2827 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2828 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2829 p[iorder[0]] = (CH) & 0xff; \
2830 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002831 } while(0)
2832
2833 /* In narrow builds we can output surrogate pairs as one codepoint,
2834 so we need less space. */
2835#ifndef Py_UNICODE_WIDE
2836 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2838 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2839 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002840#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002841 nsize = (size - pairs + (byteorder == 0));
2842 bytesize = nsize * 4;
2843 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002844 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002845 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002846 if (v == NULL)
2847 return NULL;
2848
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002849 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002850 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002852 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002853 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002854
2855 if (byteorder == -1) {
2856 /* force LE */
2857 iorder[0] = 0;
2858 iorder[1] = 1;
2859 iorder[2] = 2;
2860 iorder[3] = 3;
2861 }
2862 else if (byteorder == 1) {
2863 /* force BE */
2864 iorder[0] = 3;
2865 iorder[1] = 2;
2866 iorder[2] = 1;
2867 iorder[3] = 0;
2868 }
2869
2870 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002871 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002872#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002873 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2874 Py_UCS4 ch2 = *s;
2875 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2876 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2877 s++;
2878 size--;
2879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002880 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002881#endif
2882 STORECHAR(ch);
2883 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002884
2885 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002886 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002887#undef STORECHAR
2888}
2889
2890PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2891{
2892 if (!PyUnicode_Check(unicode)) {
2893 PyErr_BadArgument();
2894 return NULL;
2895 }
2896 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 PyUnicode_GET_SIZE(unicode),
2898 NULL,
2899 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002900}
2901
/* --- UTF-16 Codec ------------------------------------------------------- */
2903
Tim Peters772747b2001-08-09 22:21:55 +00002904PyObject *
2905PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 Py_ssize_t size,
2907 const char *errors,
2908 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909{
Walter Dörwald69652032004-09-07 20:24:22 +00002910 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2911}
2912
Antoine Pitrouab868312009-01-10 15:40:25 +00002913/* Two masks for fast checking of whether a C 'long' may contain
2914 UTF16-encoded surrogate characters. This is an efficient heuristic,
2915 assuming that non-surrogate characters with a code point >= 0x8000 are
2916 rare in most input.
2917 FAST_CHAR_MASK is used when the input is in native byte ordering,
2918 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00002919*/
Antoine Pitrouab868312009-01-10 15:40:25 +00002920#if (SIZEOF_LONG == 8)
2921# define FAST_CHAR_MASK 0x8000800080008000L
2922# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2923#elif (SIZEOF_LONG == 4)
2924# define FAST_CHAR_MASK 0x80008000L
2925# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2926#else
2927# error C 'long' size should be either 4 or 8!
2928#endif
2929
Walter Dörwald69652032004-09-07 20:24:22 +00002930PyObject *
2931PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002932 Py_ssize_t size,
2933 const char *errors,
2934 int *byteorder,
2935 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002936{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002938 Py_ssize_t startinpos;
2939 Py_ssize_t endinpos;
2940 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 PyUnicodeObject *unicode;
2942 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002943 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002944 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002945 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002946 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002947 /* Offsets from q for retrieving byte pairs in the right order. */
2948#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2949 int ihi = 1, ilo = 0;
2950#else
2951 int ihi = 0, ilo = 1;
2952#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002953 PyObject *errorHandler = NULL;
2954 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955
2956 /* Note: size will always be longer than the resulting Unicode
2957 character count */
2958 unicode = _PyUnicode_New(size);
2959 if (!unicode)
2960 return NULL;
2961 if (size == 0)
2962 return (PyObject *)unicode;
2963
2964 /* Unpack UTF-16 encoded data */
2965 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002966 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002967 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968
2969 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002970 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002972 /* Check for BOM marks (U+FEFF) in the input and adjust current
2973 byte order setting accordingly. In native mode, the leading BOM
2974 mark is skipped, in all other modes, it is copied to the output
2975 stream as-is (giving a ZWNBSP character). */
2976 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002977 if (size >= 2) {
2978 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002979#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 if (bom == 0xFEFF) {
2981 q += 2;
2982 bo = -1;
2983 }
2984 else if (bom == 0xFFFE) {
2985 q += 2;
2986 bo = 1;
2987 }
Tim Petersced69f82003-09-16 20:30:58 +00002988#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002989 if (bom == 0xFEFF) {
2990 q += 2;
2991 bo = 1;
2992 }
2993 else if (bom == 0xFFFE) {
2994 q += 2;
2995 bo = -1;
2996 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002997#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002998 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000
Tim Peters772747b2001-08-09 22:21:55 +00003001 if (bo == -1) {
3002 /* force LE */
3003 ihi = 1;
3004 ilo = 0;
3005 }
3006 else if (bo == 1) {
3007 /* force BE */
3008 ihi = 0;
3009 ilo = 1;
3010 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003011#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3012 native_ordering = ilo < ihi;
3013#else
3014 native_ordering = ilo > ihi;
3015#endif
Tim Peters772747b2001-08-09 22:21:55 +00003016
Antoine Pitrouab868312009-01-10 15:40:25 +00003017 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003018 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003020 /* First check for possible aligned read of a C 'long'. Unaligned
3021 reads are more expensive, better to defer to another iteration. */
3022 if (!((size_t) q & LONG_PTR_MASK)) {
3023 /* Fast path for runs of non-surrogate chars. */
3024 register const unsigned char *_q = q;
3025 Py_UNICODE *_p = p;
3026 if (native_ordering) {
3027 /* Native ordering is simple: as long as the input cannot
3028 possibly contain a surrogate char, do an unrolled copy
3029 of several 16-bit code points to the target object.
3030 The non-surrogate check is done on several input bytes
3031 at a time (as many as a C 'long' can contain). */
3032 while (_q < aligned_end) {
3033 unsigned long data = * (unsigned long *) _q;
3034 if (data & FAST_CHAR_MASK)
3035 break;
3036 _p[0] = ((unsigned short *) _q)[0];
3037 _p[1] = ((unsigned short *) _q)[1];
3038#if (SIZEOF_LONG == 8)
3039 _p[2] = ((unsigned short *) _q)[2];
3040 _p[3] = ((unsigned short *) _q)[3];
3041#endif
3042 _q += SIZEOF_LONG;
3043 _p += SIZEOF_LONG / 2;
3044 }
3045 }
3046 else {
3047 /* Byteswapped ordering is similar, but we must decompose
3048 the copy bytewise, and take care of zero'ing out the
3049 upper bytes if the target object is in 32-bit units
3050 (that is, in UCS-4 builds). */
3051 while (_q < aligned_end) {
3052 unsigned long data = * (unsigned long *) _q;
3053 if (data & SWAPPED_FAST_CHAR_MASK)
3054 break;
3055 /* Zero upper bytes in UCS-4 builds */
3056#if (Py_UNICODE_SIZE > 2)
3057 _p[0] = 0;
3058 _p[1] = 0;
3059#if (SIZEOF_LONG == 8)
3060 _p[2] = 0;
3061 _p[3] = 0;
3062#endif
3063#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003064 /* Issue #4916; UCS-4 builds on big endian machines must
3065 fill the two last bytes of each 4-byte unit. */
3066#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3067# define OFF 2
3068#else
3069# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003070#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003071 ((unsigned char *) _p)[OFF + 1] = _q[0];
3072 ((unsigned char *) _p)[OFF + 0] = _q[1];
3073 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3074 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3075#if (SIZEOF_LONG == 8)
3076 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3077 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3078 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3079 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3080#endif
3081#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003082 _q += SIZEOF_LONG;
3083 _p += SIZEOF_LONG / 2;
3084 }
3085 }
3086 p = _p;
3087 q = _q;
3088 if (q >= e)
3089 break;
3090 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003092
Benjamin Peterson14339b62009-01-31 16:36:08 +00003093 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003094
3095 if (ch < 0xD800 || ch > 0xDFFF) {
3096 *p++ = ch;
3097 continue;
3098 }
3099
3100 /* UTF-16 code pair: */
3101 if (q > e) {
3102 errmsg = "unexpected end of data";
3103 startinpos = (((const char *)q) - 2) - starts;
3104 endinpos = ((const char *)e) + 1 - starts;
3105 goto utf16Error;
3106 }
3107 if (0xD800 <= ch && ch <= 0xDBFF) {
3108 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3109 q += 2;
3110 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003111#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 *p++ = ch;
3113 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003114#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003115 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003116#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003117 continue;
3118 }
3119 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003120 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003121 startinpos = (((const char *)q)-4)-starts;
3122 endinpos = startinpos+2;
3123 goto utf16Error;
3124 }
3125
Benjamin Peterson14339b62009-01-31 16:36:08 +00003126 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 errmsg = "illegal encoding";
3128 startinpos = (((const char *)q)-2)-starts;
3129 endinpos = startinpos+2;
3130 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003131
Benjamin Peterson29060642009-01-31 22:14:21 +00003132 utf16Error:
3133 outpos = p - PyUnicode_AS_UNICODE(unicode);
3134 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003135 errors,
3136 &errorHandler,
3137 "utf16", errmsg,
3138 &starts,
3139 (const char **)&e,
3140 &startinpos,
3141 &endinpos,
3142 &exc,
3143 (const char **)&q,
3144 &unicode,
3145 &outpos,
3146 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003147 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003149 /* remaining byte at the end? (size should be even) */
3150 if (e == q) {
3151 if (!consumed) {
3152 errmsg = "truncated data";
3153 startinpos = ((const char *)q) - starts;
3154 endinpos = ((const char *)e) + 1 - starts;
3155 outpos = p - PyUnicode_AS_UNICODE(unicode);
3156 if (unicode_decode_call_errorhandler(
3157 errors,
3158 &errorHandler,
3159 "utf16", errmsg,
3160 &starts,
3161 (const char **)&e,
3162 &startinpos,
3163 &endinpos,
3164 &exc,
3165 (const char **)&q,
3166 &unicode,
3167 &outpos,
3168 &p))
3169 goto onError;
3170 /* The remaining input chars are ignored if the callback
3171 chooses to skip the input */
3172 }
3173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174
3175 if (byteorder)
3176 *byteorder = bo;
3177
Walter Dörwald69652032004-09-07 20:24:22 +00003178 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003180
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003182 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 goto onError;
3184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 Py_XDECREF(errorHandler);
3186 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187 return (PyObject *)unicode;
3188
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003190 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003191 Py_XDECREF(errorHandler);
3192 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 return NULL;
3194}
3195
Antoine Pitrouab868312009-01-10 15:40:25 +00003196#undef FAST_CHAR_MASK
3197#undef SWAPPED_FAST_CHAR_MASK
3198
Tim Peters772747b2001-08-09 22:21:55 +00003199PyObject *
3200PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003201 Py_ssize_t size,
3202 const char *errors,
3203 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003205 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003206 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003207 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003208#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003209 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003210#else
3211 const int pairs = 0;
3212#endif
Tim Peters772747b2001-08-09 22:21:55 +00003213 /* Offsets from p for storing byte pairs in the right order. */
3214#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3215 int ihi = 1, ilo = 0;
3216#else
3217 int ihi = 0, ilo = 1;
3218#endif
3219
Benjamin Peterson29060642009-01-31 22:14:21 +00003220#define STORECHAR(CH) \
3221 do { \
3222 p[ihi] = ((CH) >> 8) & 0xff; \
3223 p[ilo] = (CH) & 0xff; \
3224 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003225 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003227#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003228 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 if (s[i] >= 0x10000)
3230 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003231#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003232 /* 2 * (size + pairs + (byteorder == 0)) */
3233 if (size > PY_SSIZE_T_MAX ||
3234 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003236 nsize = size + pairs + (byteorder == 0);
3237 bytesize = nsize * 2;
3238 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003240 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 if (v == NULL)
3242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003244 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003247 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003248 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003249
3250 if (byteorder == -1) {
3251 /* force LE */
3252 ihi = 1;
3253 ilo = 0;
3254 }
3255 else if (byteorder == 1) {
3256 /* force BE */
3257 ihi = 0;
3258 ilo = 1;
3259 }
3260
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003261 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 Py_UNICODE ch = *s++;
3263 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003264#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 if (ch >= 0x10000) {
3266 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3267 ch = 0xD800 | ((ch-0x10000) >> 10);
3268 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003269#endif
Tim Peters772747b2001-08-09 22:21:55 +00003270 STORECHAR(ch);
3271 if (ch2)
3272 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003273 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003274
3275 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003276 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003277#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003278}
3279
3280PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3281{
3282 if (!PyUnicode_Check(unicode)) {
3283 PyErr_BadArgument();
3284 return NULL;
3285 }
3286 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003287 PyUnicode_GET_SIZE(unicode),
3288 NULL,
3289 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290}
3291
3292/* --- Unicode Escape Codec ----------------------------------------------- */
3293
Fredrik Lundh06d12682001-01-24 07:59:11 +00003294static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003295
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 Py_ssize_t size,
3298 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003301 Py_ssize_t startinpos;
3302 Py_ssize_t endinpos;
3303 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003304 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003308 char* message;
3309 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003310 PyObject *errorHandler = NULL;
3311 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003312
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 /* Escaped strings will always be longer than the resulting
3314 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315 length after conversion to the true value.
3316 (but if the error callback returns a long replacement string
3317 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 v = _PyUnicode_New(size);
3319 if (v == NULL)
3320 goto onError;
3321 if (size == 0)
3322 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003326
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 while (s < end) {
3328 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003329 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331
3332 /* Non-escape characters are interpreted as Unicode ordinals */
3333 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003334 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 continue;
3336 }
3337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 /* \ - Escapes */
3340 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003341 c = *s++;
3342 if (s > end)
3343 c = '\0'; /* Invalid after \ */
3344 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003345
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 case '\n': break;
3348 case '\\': *p++ = '\\'; break;
3349 case '\'': *p++ = '\''; break;
3350 case '\"': *p++ = '\"'; break;
3351 case 'b': *p++ = '\b'; break;
3352 case 'f': *p++ = '\014'; break; /* FF */
3353 case 't': *p++ = '\t'; break;
3354 case 'n': *p++ = '\n'; break;
3355 case 'r': *p++ = '\r'; break;
3356 case 'v': *p++ = '\013'; break; /* VT */
3357 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3358
Benjamin Peterson29060642009-01-31 22:14:21 +00003359 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 case '0': case '1': case '2': case '3':
3361 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003362 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003363 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003364 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003365 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003366 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003368 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 break;
3370
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 /* hex escapes */
3372 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003374 digits = 2;
3375 message = "truncated \\xXX escape";
3376 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003380 digits = 4;
3381 message = "truncated \\uXXXX escape";
3382 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003385 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003386 digits = 8;
3387 message = "truncated \\UXXXXXXXX escape";
3388 hexescape:
3389 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 outpos = p-PyUnicode_AS_UNICODE(v);
3391 if (s+digits>end) {
3392 endinpos = size;
3393 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 errors, &errorHandler,
3395 "unicodeescape", "end of string in escape sequence",
3396 &starts, &end, &startinpos, &endinpos, &exc, &s,
3397 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 goto onError;
3399 goto nextByte;
3400 }
3401 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003402 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003403 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003404 endinpos = (s+i+1)-starts;
3405 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003406 errors, &errorHandler,
3407 "unicodeescape", message,
3408 &starts, &end, &startinpos, &endinpos, &exc, &s,
3409 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003410 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003412 }
3413 chr = (chr<<4) & ~0xF;
3414 if (c >= '0' && c <= '9')
3415 chr += c - '0';
3416 else if (c >= 'a' && c <= 'f')
3417 chr += 10 + c - 'a';
3418 else
3419 chr += 10 + c - 'A';
3420 }
3421 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003422 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 /* _decoding_error will have already written into the
3424 target buffer. */
3425 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003426 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003427 /* when we get here, chr is a 32-bit unicode character */
3428 if (chr <= 0xffff)
3429 /* UCS-2 character */
3430 *p++ = (Py_UNICODE) chr;
3431 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003432 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003433 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003434#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003435 *p++ = chr;
3436#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003437 chr -= 0x10000L;
3438 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003439 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003440#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003441 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442 endinpos = s-starts;
3443 outpos = p-PyUnicode_AS_UNICODE(v);
3444 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003445 errors, &errorHandler,
3446 "unicodeescape", "illegal Unicode character",
3447 &starts, &end, &startinpos, &endinpos, &exc, &s,
3448 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003449 goto onError;
3450 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003451 break;
3452
Benjamin Peterson29060642009-01-31 22:14:21 +00003453 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003454 case 'N':
3455 message = "malformed \\N character escape";
3456 if (ucnhash_CAPI == NULL) {
3457 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003458 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003459 if (ucnhash_CAPI == NULL)
3460 goto ucnhashError;
3461 }
3462 if (*s == '{') {
3463 const char *start = s+1;
3464 /* look for the closing brace */
3465 while (*s != '}' && s < end)
3466 s++;
3467 if (s > start && s < end && *s == '}') {
3468 /* found a name. look it up in the unicode database */
3469 message = "unknown Unicode character name";
3470 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003471 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003472 goto store;
3473 }
3474 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 endinpos = s-starts;
3476 outpos = p-PyUnicode_AS_UNICODE(v);
3477 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003478 errors, &errorHandler,
3479 "unicodeescape", message,
3480 &starts, &end, &startinpos, &endinpos, &exc, &s,
3481 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003482 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003483 break;
3484
3485 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003486 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 message = "\\ at end of string";
3488 s--;
3489 endinpos = s-starts;
3490 outpos = p-PyUnicode_AS_UNICODE(v);
3491 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003492 errors, &errorHandler,
3493 "unicodeescape", message,
3494 &starts, &end, &startinpos, &endinpos, &exc, &s,
3495 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003496 goto onError;
3497 }
3498 else {
3499 *p++ = '\\';
3500 *p++ = (unsigned char)s[-1];
3501 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003502 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003507 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003509 Py_XDECREF(errorHandler);
3510 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003512
Benjamin Peterson29060642009-01-31 22:14:21 +00003513 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003514 PyErr_SetString(
3515 PyExc_UnicodeError,
3516 "\\N escapes not supported (can't load unicodedata module)"
3517 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003518 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 Py_XDECREF(errorHandler);
3520 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003521 return NULL;
3522
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 Py_XDECREF(errorHandler);
3526 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 return NULL;
3528}
3529
/* Return a Unicode-Escape string version of the Unicode object
   (implemented by PyUnicode_EncodeUnicodeEscape() below).

   Note: the encoder takes no `quotes` argument; the result is not
   enclosed in u"" or u'' quotes.

*/
3536
Thomas Wouters477c8d52006-05-27 19:21:47 +00003537Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003538 Py_ssize_t size,
3539 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003540{
3541 /* like wcschr, but doesn't stop at NULL characters */
3542
3543 while (size-- > 0) {
3544 if (*s == ch)
3545 return s;
3546 s++;
3547 }
3548
3549 return NULL;
3550}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003551
Walter Dörwald79e913e2007-05-12 11:08:06 +00003552static const char *hexdigits = "0123456789abcdef";
3553
3554PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003555 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003557 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003560#ifdef Py_UNICODE_WIDE
3561 const Py_ssize_t expandsize = 10;
3562#else
3563 const Py_ssize_t expandsize = 6;
3564#endif
3565
Thomas Wouters89f507f2006-12-13 04:49:30 +00003566 /* XXX(nnorwitz): rather than over-allocating, it would be
3567 better to choose a different scheme. Perhaps scan the
3568 first N-chars of the string and allocate based on that size.
3569 */
3570 /* Initial allocation is based on the longest-possible unichr
3571 escape.
3572
3573 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3574 unichr, so in this case it's the longest unichr escape. In
3575 narrow (UTF-16) builds this is five chars per source unichr
3576 since there are two unichrs in the surrogate pair, so in narrow
3577 (UTF-16) builds it's not the longest unichr escape.
3578
3579 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3580 so in the narrow (UTF-16) build case it's the longest unichr
3581 escape.
3582 */
3583
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003584 if (size == 0)
3585 return PyBytes_FromStringAndSize(NULL, 0);
3586
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003587 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003588 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003589
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003590 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 2
3592 + expandsize*size
3593 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 if (repr == NULL)
3595 return NULL;
3596
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003597 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 while (size-- > 0) {
3600 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003601
Walter Dörwald79e913e2007-05-12 11:08:06 +00003602 /* Escape backslashes */
3603 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 *p++ = '\\';
3605 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003606 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003607 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003608
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003609#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003610 /* Map 21-bit characters to '\U00xxxxxx' */
3611 else if (ch >= 0x10000) {
3612 *p++ = '\\';
3613 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003614 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3615 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3616 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3617 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3618 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3619 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3620 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3621 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003623 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003624#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003625 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3626 else if (ch >= 0xD800 && ch < 0xDC00) {
3627 Py_UNICODE ch2;
3628 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003629
Benjamin Peterson29060642009-01-31 22:14:21 +00003630 ch2 = *s++;
3631 size--;
3632 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3633 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3634 *p++ = '\\';
3635 *p++ = 'U';
3636 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3637 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3638 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3639 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3640 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3641 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3642 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3643 *p++ = hexdigits[ucs & 0x0000000F];
3644 continue;
3645 }
3646 /* Fall through: isolated surrogates are copied as-is */
3647 s--;
3648 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003649 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003650#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003653 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 *p++ = '\\';
3655 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003656 *p++ = hexdigits[(ch >> 12) & 0x000F];
3657 *p++ = hexdigits[(ch >> 8) & 0x000F];
3658 *p++ = hexdigits[(ch >> 4) & 0x000F];
3659 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003661
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003662 /* Map special whitespace to '\t', \n', '\r' */
3663 else if (ch == '\t') {
3664 *p++ = '\\';
3665 *p++ = 't';
3666 }
3667 else if (ch == '\n') {
3668 *p++ = '\\';
3669 *p++ = 'n';
3670 }
3671 else if (ch == '\r') {
3672 *p++ = '\\';
3673 *p++ = 'r';
3674 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003675
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003676 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003677 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003679 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003680 *p++ = hexdigits[(ch >> 4) & 0x000F];
3681 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003682 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003683
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 /* Copy everything else as-is */
3685 else
3686 *p++ = (char) ch;
3687 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003689 assert(p - PyBytes_AS_STRING(repr) > 0);
3690 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3691 return NULL;
3692 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693}
3694
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003695PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003697 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698 if (!PyUnicode_Check(unicode)) {
3699 PyErr_BadArgument();
3700 return NULL;
3701 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003702 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3703 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003704 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705}
3706
3707/* --- Raw Unicode Escape Codec ------------------------------------------- */
3708
3709PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 Py_ssize_t size,
3711 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003714 Py_ssize_t startinpos;
3715 Py_ssize_t endinpos;
3716 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 const char *end;
3720 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 PyObject *errorHandler = NULL;
3722 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003723
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 /* Escaped strings will always be longer than the resulting
3725 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 length after conversion to the true value. (But decoding error
3727 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 v = _PyUnicode_New(size);
3729 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003730 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003732 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 end = s + size;
3735 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 unsigned char c;
3737 Py_UCS4 x;
3738 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003739 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740
Benjamin Peterson29060642009-01-31 22:14:21 +00003741 /* Non-escape characters are interpreted as Unicode ordinals */
3742 if (*s != '\\') {
3743 *p++ = (unsigned char)*s++;
3744 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003745 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 startinpos = s-starts;
3747
3748 /* \u-escapes are only interpreted iff the number of leading
3749 backslashes if odd */
3750 bs = s;
3751 for (;s < end;) {
3752 if (*s != '\\')
3753 break;
3754 *p++ = (unsigned char)*s++;
3755 }
3756 if (((s - bs) & 1) == 0 ||
3757 s >= end ||
3758 (*s != 'u' && *s != 'U')) {
3759 continue;
3760 }
3761 p--;
3762 count = *s=='u' ? 4 : 8;
3763 s++;
3764
3765 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3766 outpos = p-PyUnicode_AS_UNICODE(v);
3767 for (x = 0, i = 0; i < count; ++i, ++s) {
3768 c = (unsigned char)*s;
3769 if (!ISXDIGIT(c)) {
3770 endinpos = s-starts;
3771 if (unicode_decode_call_errorhandler(
3772 errors, &errorHandler,
3773 "rawunicodeescape", "truncated \\uXXXX",
3774 &starts, &end, &startinpos, &endinpos, &exc, &s,
3775 &v, &outpos, &p))
3776 goto onError;
3777 goto nextByte;
3778 }
3779 x = (x<<4) & ~0xF;
3780 if (c >= '0' && c <= '9')
3781 x += c - '0';
3782 else if (c >= 'a' && c <= 'f')
3783 x += 10 + c - 'a';
3784 else
3785 x += 10 + c - 'A';
3786 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003787 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003788 /* UCS-2 character */
3789 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003790 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003791 /* UCS-4 character. Either store directly, or as
3792 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003793#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003794 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003795#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003796 x -= 0x10000L;
3797 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3798 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003799#endif
3800 } else {
3801 endinpos = s-starts;
3802 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003803 if (unicode_decode_call_errorhandler(
3804 errors, &errorHandler,
3805 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003806 &starts, &end, &startinpos, &endinpos, &exc, &s,
3807 &v, &outpos, &p))
3808 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003809 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003810 nextByte:
3811 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003813 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003814 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 Py_XDECREF(errorHandler);
3816 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003818
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 Py_XDECREF(errorHandler);
3822 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 return NULL;
3824}
3825
3826PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003827 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003829 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 char *p;
3831 char *q;
3832
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003833#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003834 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003835#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003836 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003837#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003838
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003839 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003841
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003842 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 if (repr == NULL)
3844 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003845 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003846 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003848 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 while (size-- > 0) {
3850 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003851#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003852 /* Map 32-bit characters to '\Uxxxxxxxx' */
3853 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003854 *p++ = '\\';
3855 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003856 *p++ = hexdigits[(ch >> 28) & 0xf];
3857 *p++ = hexdigits[(ch >> 24) & 0xf];
3858 *p++ = hexdigits[(ch >> 20) & 0xf];
3859 *p++ = hexdigits[(ch >> 16) & 0xf];
3860 *p++ = hexdigits[(ch >> 12) & 0xf];
3861 *p++ = hexdigits[(ch >> 8) & 0xf];
3862 *p++ = hexdigits[(ch >> 4) & 0xf];
3863 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003864 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003865 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003866#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3868 if (ch >= 0xD800 && ch < 0xDC00) {
3869 Py_UNICODE ch2;
3870 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003871
Benjamin Peterson29060642009-01-31 22:14:21 +00003872 ch2 = *s++;
3873 size--;
3874 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3875 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3876 *p++ = '\\';
3877 *p++ = 'U';
3878 *p++ = hexdigits[(ucs >> 28) & 0xf];
3879 *p++ = hexdigits[(ucs >> 24) & 0xf];
3880 *p++ = hexdigits[(ucs >> 20) & 0xf];
3881 *p++ = hexdigits[(ucs >> 16) & 0xf];
3882 *p++ = hexdigits[(ucs >> 12) & 0xf];
3883 *p++ = hexdigits[(ucs >> 8) & 0xf];
3884 *p++ = hexdigits[(ucs >> 4) & 0xf];
3885 *p++ = hexdigits[ucs & 0xf];
3886 continue;
3887 }
3888 /* Fall through: isolated surrogates are copied as-is */
3889 s--;
3890 size++;
3891 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003892#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 /* Map 16-bit characters to '\uxxxx' */
3894 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 *p++ = '\\';
3896 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003897 *p++ = hexdigits[(ch >> 12) & 0xf];
3898 *p++ = hexdigits[(ch >> 8) & 0xf];
3899 *p++ = hexdigits[(ch >> 4) & 0xf];
3900 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003902 /* Copy everything else as-is */
3903 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 *p++ = (char) ch;
3905 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003906 size = p - q;
3907
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003908 assert(size > 0);
3909 if (_PyBytes_Resize(&repr, size) < 0)
3910 return NULL;
3911 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912}
3913
3914PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3915{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003916 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003918 PyErr_BadArgument();
3919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003921 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3922 PyUnicode_GET_SIZE(unicode));
3923
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003924 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925}
3926
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003927/* --- Unicode Internal Codec ------------------------------------------- */
3928
3929PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 Py_ssize_t size,
3931 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003932{
3933 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003934 Py_ssize_t startinpos;
3935 Py_ssize_t endinpos;
3936 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003937 PyUnicodeObject *v;
3938 Py_UNICODE *p;
3939 const char *end;
3940 const char *reason;
3941 PyObject *errorHandler = NULL;
3942 PyObject *exc = NULL;
3943
Neal Norwitzd43069c2006-01-08 01:12:10 +00003944#ifdef Py_UNICODE_WIDE
3945 Py_UNICODE unimax = PyUnicode_GetMax();
3946#endif
3947
Thomas Wouters89f507f2006-12-13 04:49:30 +00003948 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003949 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3950 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003951 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003952 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003953 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003954 p = PyUnicode_AS_UNICODE(v);
3955 end = s + size;
3956
3957 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003958 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003959 /* We have to sanity check the raw data, otherwise doom looms for
3960 some malformed UCS-4 data. */
3961 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00003962#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003963 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00003964#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003965 end-s < Py_UNICODE_SIZE
3966 )
Benjamin Peterson29060642009-01-31 22:14:21 +00003967 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003968 startinpos = s - starts;
3969 if (end-s < Py_UNICODE_SIZE) {
3970 endinpos = end-starts;
3971 reason = "truncated input";
3972 }
3973 else {
3974 endinpos = s - starts + Py_UNICODE_SIZE;
3975 reason = "illegal code point (> 0x10FFFF)";
3976 }
3977 outpos = p - PyUnicode_AS_UNICODE(v);
3978 if (unicode_decode_call_errorhandler(
3979 errors, &errorHandler,
3980 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003981 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003982 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003983 goto onError;
3984 }
3985 }
3986 else {
3987 p++;
3988 s += Py_UNICODE_SIZE;
3989 }
3990 }
3991
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003992 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003993 goto onError;
3994 Py_XDECREF(errorHandler);
3995 Py_XDECREF(exc);
3996 return (PyObject *)v;
3997
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003999 Py_XDECREF(v);
4000 Py_XDECREF(errorHandler);
4001 Py_XDECREF(exc);
4002 return NULL;
4003}
4004
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005/* --- Latin-1 Codec ------------------------------------------------------ */
4006
4007PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 Py_ssize_t size,
4009 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010{
4011 PyUnicodeObject *v;
4012 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004013 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004014
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004016 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004017 Py_UNICODE r = *(unsigned char*)s;
4018 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004019 }
4020
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 v = _PyUnicode_New(size);
4022 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004023 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004027 e = s + size;
4028 /* Unrolling the copy makes it much faster by reducing the looping
4029 overhead. This is similar to what many memcpy() implementations do. */
4030 unrolled_end = e - 4;
4031 while (s < unrolled_end) {
4032 p[0] = (unsigned char) s[0];
4033 p[1] = (unsigned char) s[1];
4034 p[2] = (unsigned char) s[2];
4035 p[3] = (unsigned char) s[3];
4036 s += 4;
4037 p += 4;
4038 }
4039 while (s < e)
4040 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004042
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 Py_XDECREF(v);
4045 return NULL;
4046}
4047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048/* create or adjust a UnicodeEncodeError */
4049static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 const char *encoding,
4051 const Py_UNICODE *unicode, Py_ssize_t size,
4052 Py_ssize_t startpos, Py_ssize_t endpos,
4053 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004055 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004056 *exceptionObject = PyUnicodeEncodeError_Create(
4057 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 }
4059 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4061 goto onError;
4062 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4063 goto onError;
4064 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4065 goto onError;
4066 return;
4067 onError:
4068 Py_DECREF(*exceptionObject);
4069 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 }
4071}
4072
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004073/* raises a UnicodeEncodeError */
4074static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 const char *encoding,
4076 const Py_UNICODE *unicode, Py_ssize_t size,
4077 Py_ssize_t startpos, Py_ssize_t endpos,
4078 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079{
4080 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004083 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004084}
4085
4086/* error handling callback helper:
4087 build arguments, call the callback and check the arguments,
4088 put the result into newpos and return the replacement string, which
4089 has to be freed by the caller */
4090static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 PyObject **errorHandler,
4092 const char *encoding, const char *reason,
4093 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4094 Py_ssize_t startpos, Py_ssize_t endpos,
4095 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004096{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004097 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004098
4099 PyObject *restuple;
4100 PyObject *resunicode;
4101
4102 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 }
4107
4108 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112
4113 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004118 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 Py_DECREF(restuple);
4120 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004122 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 &resunicode, newpos)) {
4124 Py_DECREF(restuple);
4125 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004127 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4128 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4129 Py_DECREF(restuple);
4130 return NULL;
4131 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004134 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4136 Py_DECREF(restuple);
4137 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004138 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 Py_INCREF(resunicode);
4140 Py_DECREF(restuple);
4141 return resunicode;
4142}
4143
4144static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 Py_ssize_t size,
4146 const char *errors,
4147 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148{
4149 /* output object */
4150 PyObject *res;
4151 /* pointers to the beginning and end+1 of input */
4152 const Py_UNICODE *startp = p;
4153 const Py_UNICODE *endp = p + size;
4154 /* pointer to the beginning of the unencodable characters */
4155 /* const Py_UNICODE *badp = NULL; */
4156 /* pointer into the output */
4157 char *str;
4158 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004159 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004160 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4161 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 PyObject *errorHandler = NULL;
4163 PyObject *exc = NULL;
4164 /* the following variable is used for caching string comparisons
4165 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4166 int known_errorHandler = -1;
4167
4168 /* allocate enough for a simple encoding without
4169 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004170 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004171 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004172 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004174 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004175 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 ressize = size;
4177
4178 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 /* can we encode this? */
4182 if (c<limit) {
4183 /* no overflow check, because we know that the space is enough */
4184 *str++ = (char)c;
4185 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004186 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004187 else {
4188 Py_ssize_t unicodepos = p-startp;
4189 Py_ssize_t requiredsize;
4190 PyObject *repunicode;
4191 Py_ssize_t repsize;
4192 Py_ssize_t newpos;
4193 Py_ssize_t respos;
4194 Py_UNICODE *uni2;
4195 /* startpos for collecting unencodable chars */
4196 const Py_UNICODE *collstart = p;
4197 const Py_UNICODE *collend = p;
4198 /* find all unecodable characters */
4199 while ((collend < endp) && ((*collend)>=limit))
4200 ++collend;
4201 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4202 if (known_errorHandler==-1) {
4203 if ((errors==NULL) || (!strcmp(errors, "strict")))
4204 known_errorHandler = 1;
4205 else if (!strcmp(errors, "replace"))
4206 known_errorHandler = 2;
4207 else if (!strcmp(errors, "ignore"))
4208 known_errorHandler = 3;
4209 else if (!strcmp(errors, "xmlcharrefreplace"))
4210 known_errorHandler = 4;
4211 else
4212 known_errorHandler = 0;
4213 }
4214 switch (known_errorHandler) {
4215 case 1: /* strict */
4216 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4217 goto onError;
4218 case 2: /* replace */
4219 while (collstart++<collend)
4220 *str++ = '?'; /* fall through */
4221 case 3: /* ignore */
4222 p = collend;
4223 break;
4224 case 4: /* xmlcharrefreplace */
4225 respos = str - PyBytes_AS_STRING(res);
4226 /* determine replacement size (temporarily (mis)uses p) */
4227 for (p = collstart, repsize = 0; p < collend; ++p) {
4228 if (*p<10)
4229 repsize += 2+1+1;
4230 else if (*p<100)
4231 repsize += 2+2+1;
4232 else if (*p<1000)
4233 repsize += 2+3+1;
4234 else if (*p<10000)
4235 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004236#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 else
4238 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004239#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 else if (*p<100000)
4241 repsize += 2+5+1;
4242 else if (*p<1000000)
4243 repsize += 2+6+1;
4244 else
4245 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004246#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 }
4248 requiredsize = respos+repsize+(endp-collend);
4249 if (requiredsize > ressize) {
4250 if (requiredsize<2*ressize)
4251 requiredsize = 2*ressize;
4252 if (_PyBytes_Resize(&res, requiredsize))
4253 goto onError;
4254 str = PyBytes_AS_STRING(res) + respos;
4255 ressize = requiredsize;
4256 }
4257 /* generate replacement (temporarily (mis)uses p) */
4258 for (p = collstart; p < collend; ++p) {
4259 str += sprintf(str, "&#%d;", (int)*p);
4260 }
4261 p = collend;
4262 break;
4263 default:
4264 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4265 encoding, reason, startp, size, &exc,
4266 collstart-startp, collend-startp, &newpos);
4267 if (repunicode == NULL)
4268 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004269 if (PyBytes_Check(repunicode)) {
4270 /* Directly copy bytes result to output. */
4271 repsize = PyBytes_Size(repunicode);
4272 if (repsize > 1) {
4273 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004274 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004275 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4276 Py_DECREF(repunicode);
4277 goto onError;
4278 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004279 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004280 ressize += repsize-1;
4281 }
4282 memcpy(str, PyBytes_AsString(repunicode), repsize);
4283 str += repsize;
4284 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004285 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004286 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004287 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 /* need more space? (at least enough for what we
4289 have+the replacement+the rest of the string, so
4290 we won't have to check space for encodable characters) */
4291 respos = str - PyBytes_AS_STRING(res);
4292 repsize = PyUnicode_GET_SIZE(repunicode);
4293 requiredsize = respos+repsize+(endp-collend);
4294 if (requiredsize > ressize) {
4295 if (requiredsize<2*ressize)
4296 requiredsize = 2*ressize;
4297 if (_PyBytes_Resize(&res, requiredsize)) {
4298 Py_DECREF(repunicode);
4299 goto onError;
4300 }
4301 str = PyBytes_AS_STRING(res) + respos;
4302 ressize = requiredsize;
4303 }
4304 /* check if there is anything unencodable in the replacement
4305 and copy it to the output */
4306 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4307 c = *uni2;
4308 if (c >= limit) {
4309 raise_encode_exception(&exc, encoding, startp, size,
4310 unicodepos, unicodepos+1, reason);
4311 Py_DECREF(repunicode);
4312 goto onError;
4313 }
4314 *str = (char)c;
4315 }
4316 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004317 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004318 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004319 }
4320 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004321 /* Resize if we allocated to much */
4322 size = str - PyBytes_AS_STRING(res);
4323 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004324 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004325 if (_PyBytes_Resize(&res, size) < 0)
4326 goto onError;
4327 }
4328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 Py_XDECREF(errorHandler);
4330 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004331 return res;
4332
4333 onError:
4334 Py_XDECREF(res);
4335 Py_XDECREF(errorHandler);
4336 Py_XDECREF(exc);
4337 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338}
4339
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004341 Py_ssize_t size,
4342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004344 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345}
4346
4347PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4348{
4349 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004350 PyErr_BadArgument();
4351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352 }
4353 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 PyUnicode_GET_SIZE(unicode),
4355 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356}
4357
4358/* --- 7-bit ASCII Codec -------------------------------------------------- */
4359
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 Py_ssize_t size,
4362 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 PyUnicodeObject *v;
4366 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004367 Py_ssize_t startinpos;
4368 Py_ssize_t endinpos;
4369 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 const char *e;
4371 PyObject *errorHandler = NULL;
4372 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004373
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004375 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 Py_UNICODE r = *(unsigned char*)s;
4377 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004378 }
Tim Petersced69f82003-09-16 20:30:58 +00004379
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 v = _PyUnicode_New(size);
4381 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004384 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 e = s + size;
4387 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 register unsigned char c = (unsigned char)*s;
4389 if (c < 128) {
4390 *p++ = c;
4391 ++s;
4392 }
4393 else {
4394 startinpos = s-starts;
4395 endinpos = startinpos + 1;
4396 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4397 if (unicode_decode_call_errorhandler(
4398 errors, &errorHandler,
4399 "ascii", "ordinal not in range(128)",
4400 &starts, &e, &startinpos, &endinpos, &exc, &s,
4401 &v, &outpos, &p))
4402 goto onError;
4403 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004405 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4407 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 Py_XDECREF(errorHandler);
4409 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004411
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 Py_XDECREF(errorHandler);
4415 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 return NULL;
4417}
4418
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 Py_ssize_t size,
4421 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424}
4425
4426PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4427{
4428 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 PyErr_BadArgument();
4430 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 }
4432 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 PyUnicode_GET_SIZE(unicode),
4434 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435}
4436
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004437#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004438
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004439/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004440
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004441#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004442#define NEED_RETRY
4443#endif
4444
4445/* XXX This code is limited to "true" double-byte encodings, as
4446 a) it assumes an incomplete character consists of a single byte, and
4447 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004449
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   A byte for which IsDBCSLeadByte() is true only counts when one of the
   following holds for the position CharPrev() reports as the previous
   character:
     - CharPrev() returned the same position (no previous character),
     - the previous byte is not itself a lead byte, or
     - the previous character is exactly two bytes before this one. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4460
4461/*
4462 * Decode MBCS string into unicode object. If 'final' is set, converts
4463 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4464 */
4465static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 const char *s, /* MBCS string */
4467 int size, /* sizeof MBCS string */
4468 int final)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004469{
4470 Py_UNICODE *p;
4471 Py_ssize_t n = 0;
4472 int usize = 0;
4473
4474 assert(size >= 0);
4475
4476 /* Skip trailing lead-byte unless 'final' is set */
4477 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004479
4480 /* First get the size of the result */
4481 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4483 if (usize == 0) {
4484 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4485 return -1;
4486 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004487 }
4488
4489 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 /* Create unicode object */
4491 *v = _PyUnicode_New(usize);
4492 if (*v == NULL)
4493 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004494 }
4495 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 /* Extend unicode object */
4497 n = PyUnicode_GET_SIZE(*v);
4498 if (_PyUnicode_Resize(v, n + usize) < 0)
4499 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004500 }
4501
4502 /* Do the conversion */
4503 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 p = PyUnicode_AS_UNICODE(*v) + n;
4505 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4506 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4507 return -1;
4508 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004509 }
4510
4511 return size;
4512}
4513
4514PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 Py_ssize_t size,
4516 const char *errors,
4517 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004518{
4519 PyUnicodeObject *v = NULL;
4520 int done;
4521
4522 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004524
4525#ifdef NEED_RETRY
4526 retry:
4527 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 done = decode_mbcs(&v, s, INT_MAX, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004529 else
4530#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 done = decode_mbcs(&v, s, (int)size, !consumed);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004532
4533 if (done < 0) {
4534 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004536 }
4537
4538 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004540
4541#ifdef NEED_RETRY
4542 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 s += done;
4544 size -= done;
4545 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004546 }
4547#endif
4548
4549 return (PyObject *)v;
4550}
4551
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004552PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 Py_ssize_t size,
4554 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004555{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004556 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4557}
4558
4559/*
4560 * Convert unicode into string object (MBCS).
4561 * Returns 0 if succeed, -1 otherwise.
4562 */
4563static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 const Py_UNICODE *p, /* unicode */
4565 int size) /* size of unicode */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004566{
4567 int mbcssize = 0;
4568 Py_ssize_t n = 0;
4569
4570 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004571
4572 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004573 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4575 if (mbcssize == 0) {
4576 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4577 return -1;
4578 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004579 }
4580
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004581 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 /* Create string object */
4583 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4584 if (*repr == NULL)
4585 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004586 }
4587 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 /* Extend string object */
4589 n = PyBytes_Size(*repr);
4590 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4591 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004592 }
4593
4594 /* Do the conversion */
4595 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 char *s = PyBytes_AS_STRING(*repr) + n;
4597 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4598 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4599 return -1;
4600 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004601 }
4602
4603 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004604}
4605
4606PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 Py_ssize_t size,
4608 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004609{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004610 PyObject *repr = NULL;
4611 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004612
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004613#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004615 if (size > INT_MAX)
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 ret = encode_mbcs(&repr, p, INT_MAX);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004617 else
4618#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004619 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004620
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004621 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 Py_XDECREF(repr);
4623 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004624 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004625
4626#ifdef NEED_RETRY
4627 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004628 p += INT_MAX;
4629 size -= INT_MAX;
4630 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004631 }
4632#endif
4633
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004634 return repr;
4635}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004636
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004637PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4638{
4639 if (!PyUnicode_Check(unicode)) {
4640 PyErr_BadArgument();
4641 return NULL;
4642 }
4643 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 PyUnicode_GET_SIZE(unicode),
4645 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004646}
4647
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004648#undef NEED_RETRY
4649
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004650#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004651
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652/* --- Character Mapping Codec -------------------------------------------- */
4653
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 Py_ssize_t size,
4656 PyObject *mapping,
4657 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004660 Py_ssize_t startinpos;
4661 Py_ssize_t endinpos;
4662 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664 PyUnicodeObject *v;
4665 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004666 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004667 PyObject *errorHandler = NULL;
4668 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004669 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004670 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004671
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672 /* Default to Latin-1 */
4673 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675
4676 v = _PyUnicode_New(size);
4677 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004683 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 mapstring = PyUnicode_AS_UNICODE(mapping);
4685 maplen = PyUnicode_GET_SIZE(mapping);
4686 while (s < e) {
4687 unsigned char ch = *s;
4688 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 if (ch < maplen)
4691 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 if (x == 0xfffe) {
4694 /* undefined mapping */
4695 outpos = p-PyUnicode_AS_UNICODE(v);
4696 startinpos = s-starts;
4697 endinpos = startinpos+1;
4698 if (unicode_decode_call_errorhandler(
4699 errors, &errorHandler,
4700 "charmap", "character maps to <undefined>",
4701 &starts, &e, &startinpos, &endinpos, &exc, &s,
4702 &v, &outpos, &p)) {
4703 goto onError;
4704 }
4705 continue;
4706 }
4707 *p++ = x;
4708 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004709 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004710 }
4711 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 while (s < e) {
4713 unsigned char ch = *s;
4714 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004715
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4717 w = PyLong_FromLong((long)ch);
4718 if (w == NULL)
4719 goto onError;
4720 x = PyObject_GetItem(mapping, w);
4721 Py_DECREF(w);
4722 if (x == NULL) {
4723 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4724 /* No mapping found means: mapping is undefined. */
4725 PyErr_Clear();
4726 x = Py_None;
4727 Py_INCREF(x);
4728 } else
4729 goto onError;
4730 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004731
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 /* Apply mapping */
4733 if (PyLong_Check(x)) {
4734 long value = PyLong_AS_LONG(x);
4735 if (value < 0 || value > 65535) {
4736 PyErr_SetString(PyExc_TypeError,
4737 "character mapping must be in range(65536)");
4738 Py_DECREF(x);
4739 goto onError;
4740 }
4741 *p++ = (Py_UNICODE)value;
4742 }
4743 else if (x == Py_None) {
4744 /* undefined mapping */
4745 outpos = p-PyUnicode_AS_UNICODE(v);
4746 startinpos = s-starts;
4747 endinpos = startinpos+1;
4748 if (unicode_decode_call_errorhandler(
4749 errors, &errorHandler,
4750 "charmap", "character maps to <undefined>",
4751 &starts, &e, &startinpos, &endinpos, &exc, &s,
4752 &v, &outpos, &p)) {
4753 Py_DECREF(x);
4754 goto onError;
4755 }
4756 Py_DECREF(x);
4757 continue;
4758 }
4759 else if (PyUnicode_Check(x)) {
4760 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004761
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 if (targetsize == 1)
4763 /* 1-1 mapping */
4764 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004765
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 else if (targetsize > 1) {
4767 /* 1-n mapping */
4768 if (targetsize > extrachars) {
4769 /* resize first */
4770 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4771 Py_ssize_t needed = (targetsize - extrachars) + \
4772 (targetsize << 2);
4773 extrachars += needed;
4774 /* XXX overflow detection missing */
4775 if (_PyUnicode_Resize(&v,
4776 PyUnicode_GET_SIZE(v) + needed) < 0) {
4777 Py_DECREF(x);
4778 goto onError;
4779 }
4780 p = PyUnicode_AS_UNICODE(v) + oldpos;
4781 }
4782 Py_UNICODE_COPY(p,
4783 PyUnicode_AS_UNICODE(x),
4784 targetsize);
4785 p += targetsize;
4786 extrachars -= targetsize;
4787 }
4788 /* 1-0 mapping: skip the character */
4789 }
4790 else {
4791 /* wrong return value */
4792 PyErr_SetString(PyExc_TypeError,
4793 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004794 Py_DECREF(x);
4795 goto onError;
4796 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 Py_DECREF(x);
4798 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 }
4801 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4803 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 Py_XDECREF(errorHandler);
4805 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004807
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004809 Py_XDECREF(errorHandler);
4810 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 Py_XDECREF(v);
4812 return NULL;
4813}
4814
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004815/* Charmap encoding: the lookup table */
4816
4817struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 PyObject_HEAD
4819 unsigned char level1[32];
4820 int count2, count3;
4821 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004822};
4823
4824static PyObject*
4825encoding_map_size(PyObject *obj, PyObject* args)
4826{
4827 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004828 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00004829 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004830}
4831
4832static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004833 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 PyDoc_STR("Return the size (in bytes) of this object") },
4835 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004836};
4837
4838static void
4839encoding_map_dealloc(PyObject* o)
4840{
Benjamin Peterson14339b62009-01-31 16:36:08 +00004841 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004842}
4843
4844static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00004845 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 "EncodingMap", /*tp_name*/
4847 sizeof(struct encoding_map), /*tp_basicsize*/
4848 0, /*tp_itemsize*/
4849 /* methods */
4850 encoding_map_dealloc, /*tp_dealloc*/
4851 0, /*tp_print*/
4852 0, /*tp_getattr*/
4853 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00004854 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 0, /*tp_repr*/
4856 0, /*tp_as_number*/
4857 0, /*tp_as_sequence*/
4858 0, /*tp_as_mapping*/
4859 0, /*tp_hash*/
4860 0, /*tp_call*/
4861 0, /*tp_str*/
4862 0, /*tp_getattro*/
4863 0, /*tp_setattro*/
4864 0, /*tp_as_buffer*/
4865 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4866 0, /*tp_doc*/
4867 0, /*tp_traverse*/
4868 0, /*tp_clear*/
4869 0, /*tp_richcompare*/
4870 0, /*tp_weaklistoffset*/
4871 0, /*tp_iter*/
4872 0, /*tp_iternext*/
4873 encoding_map_methods, /*tp_methods*/
4874 0, /*tp_members*/
4875 0, /*tp_getset*/
4876 0, /*tp_base*/
4877 0, /*tp_dict*/
4878 0, /*tp_descr_get*/
4879 0, /*tp_descr_set*/
4880 0, /*tp_dictoffset*/
4881 0, /*tp_init*/
4882 0, /*tp_alloc*/
4883 0, /*tp_new*/
4884 0, /*tp_free*/
4885 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004886};
4887
4888PyObject*
4889PyUnicode_BuildEncodingMap(PyObject* string)
4890{
4891 Py_UNICODE *decode;
4892 PyObject *result;
4893 struct encoding_map *mresult;
4894 int i;
4895 int need_dict = 0;
4896 unsigned char level1[32];
4897 unsigned char level2[512];
4898 unsigned char *mlevel1, *mlevel2, *mlevel3;
4899 int count2 = 0, count3 = 0;
4900
4901 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4902 PyErr_BadArgument();
4903 return NULL;
4904 }
4905 decode = PyUnicode_AS_UNICODE(string);
4906 memset(level1, 0xFF, sizeof level1);
4907 memset(level2, 0xFF, sizeof level2);
4908
4909 /* If there isn't a one-to-one mapping of NULL to \0,
4910 or if there are non-BMP characters, we need to use
4911 a mapping dictionary. */
4912 if (decode[0] != 0)
4913 need_dict = 1;
4914 for (i = 1; i < 256; i++) {
4915 int l1, l2;
4916 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00004917#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004918 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00004919#endif
4920 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004921 need_dict = 1;
4922 break;
4923 }
4924 if (decode[i] == 0xFFFE)
4925 /* unmapped character */
4926 continue;
4927 l1 = decode[i] >> 11;
4928 l2 = decode[i] >> 7;
4929 if (level1[l1] == 0xFF)
4930 level1[l1] = count2++;
4931 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00004932 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004933 }
4934
4935 if (count2 >= 0xFF || count3 >= 0xFF)
4936 need_dict = 1;
4937
4938 if (need_dict) {
4939 PyObject *result = PyDict_New();
4940 PyObject *key, *value;
4941 if (!result)
4942 return NULL;
4943 for (i = 0; i < 256; i++) {
4944 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004945 key = PyLong_FromLong(decode[i]);
4946 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004947 if (!key || !value)
4948 goto failed1;
4949 if (PyDict_SetItem(result, key, value) == -1)
4950 goto failed1;
4951 Py_DECREF(key);
4952 Py_DECREF(value);
4953 }
4954 return result;
4955 failed1:
4956 Py_XDECREF(key);
4957 Py_XDECREF(value);
4958 Py_DECREF(result);
4959 return NULL;
4960 }
4961
4962 /* Create a three-level trie */
4963 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4964 16*count2 + 128*count3 - 1);
4965 if (!result)
4966 return PyErr_NoMemory();
4967 PyObject_Init(result, &EncodingMapType);
4968 mresult = (struct encoding_map*)result;
4969 mresult->count2 = count2;
4970 mresult->count3 = count3;
4971 mlevel1 = mresult->level1;
4972 mlevel2 = mresult->level23;
4973 mlevel3 = mresult->level23 + 16*count2;
4974 memcpy(mlevel1, level1, 32);
4975 memset(mlevel2, 0xFF, 16*count2);
4976 memset(mlevel3, 0, 128*count3);
4977 count3 = 0;
4978 for (i = 1; i < 256; i++) {
4979 int o1, o2, o3, i2, i3;
4980 if (decode[i] == 0xFFFE)
4981 /* unmapped character */
4982 continue;
4983 o1 = decode[i]>>11;
4984 o2 = (decode[i]>>7) & 0xF;
4985 i2 = 16*mlevel1[o1] + o2;
4986 if (mlevel2[i2] == 0xFF)
4987 mlevel2[i2] = count3++;
4988 o3 = decode[i] & 0x7F;
4989 i3 = 128*mlevel2[i2] + o3;
4990 mlevel3[i3] = i;
4991 }
4992 return result;
4993}
4994
4995static int
4996encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4997{
4998 struct encoding_map *map = (struct encoding_map*)mapping;
4999 int l1 = c>>11;
5000 int l2 = (c>>7) & 0xF;
5001 int l3 = c & 0x7F;
5002 int i;
5003
5004#ifdef Py_UNICODE_WIDE
5005 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005007 }
5008#endif
5009 if (c == 0)
5010 return 0;
5011 /* level 1*/
5012 i = map->level1[l1];
5013 if (i == 0xFF) {
5014 return -1;
5015 }
5016 /* level 2*/
5017 i = map->level23[16*i+l2];
5018 if (i == 0xFF) {
5019 return -1;
5020 }
5021 /* level 3 */
5022 i = map->level23[16*map->count2 + 128*i + l3];
5023 if (i == 0) {
5024 return -1;
5025 }
5026 return i;
5027}
5028
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005029/* Lookup the character ch in the mapping. If the character
5030 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005031 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005032static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033{
Christian Heimes217cfd12007-12-02 14:31:20 +00005034 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005035 PyObject *x;
5036
5037 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005039 x = PyObject_GetItem(mapping, w);
5040 Py_DECREF(w);
5041 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5043 /* No mapping found means: mapping is undefined. */
5044 PyErr_Clear();
5045 x = Py_None;
5046 Py_INCREF(x);
5047 return x;
5048 } else
5049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005051 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005053 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 long value = PyLong_AS_LONG(x);
5055 if (value < 0 || value > 255) {
5056 PyErr_SetString(PyExc_TypeError,
5057 "character mapping must be in range(256)");
5058 Py_DECREF(x);
5059 return NULL;
5060 }
5061 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005063 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 /* wrong return value */
5067 PyErr_Format(PyExc_TypeError,
5068 "character mapping must return integer, bytes or None, not %.400s",
5069 x->ob_type->tp_name);
5070 Py_DECREF(x);
5071 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 }
5073}
5074
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005075static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005076charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005077{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005078 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5079 /* exponentially overallocate to minimize reallocations */
5080 if (requiredsize < 2*outsize)
5081 requiredsize = 2*outsize;
5082 if (_PyBytes_Resize(outobj, requiredsize))
5083 return -1;
5084 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005085}
5086
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005090/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005091 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005092 space is available. Return a new reference to the object that
5093 was put in the output buffer, or Py_None, if the mapping was undefined
5094 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005095 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005096static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005097charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005099{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005100 PyObject *rep;
5101 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005102 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103
Christian Heimes90aa7642007-12-19 02:45:37 +00005104 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005105 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005107 if (res == -1)
5108 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 if (outsize<requiredsize)
5110 if (charmapencode_resize(outobj, outpos, requiredsize))
5111 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005112 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005113 outstart[(*outpos)++] = (char)res;
5114 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005115 }
5116
5117 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005120 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 Py_DECREF(rep);
5122 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005123 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 if (PyLong_Check(rep)) {
5125 Py_ssize_t requiredsize = *outpos+1;
5126 if (outsize<requiredsize)
5127 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5128 Py_DECREF(rep);
5129 return enc_EXCEPTION;
5130 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005131 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005133 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 else {
5135 const char *repchars = PyBytes_AS_STRING(rep);
5136 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5137 Py_ssize_t requiredsize = *outpos+repsize;
5138 if (outsize<requiredsize)
5139 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5140 Py_DECREF(rep);
5141 return enc_EXCEPTION;
5142 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005143 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 memcpy(outstart + *outpos, repchars, repsize);
5145 *outpos += repsize;
5146 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005147 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005148 Py_DECREF(rep);
5149 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150}
5151
5152/* handle an error in PyUnicode_EncodeCharmap
5153 Return 0 on success, -1 on error */
5154static
5155int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005156 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005157 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005158 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005159 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160{
5161 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005162 Py_ssize_t repsize;
5163 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 Py_UNICODE *uni2;
5165 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005166 Py_ssize_t collstartpos = *inpos;
5167 Py_ssize_t collendpos = *inpos+1;
5168 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005169 char *encoding = "charmap";
5170 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005171 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005173 /* find all unencodable characters */
5174 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005175 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005176 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005177 int res = encoding_map_lookup(p[collendpos], mapping);
5178 if (res != -1)
5179 break;
5180 ++collendpos;
5181 continue;
5182 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005183
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 rep = charmapencode_lookup(p[collendpos], mapping);
5185 if (rep==NULL)
5186 return -1;
5187 else if (rep!=Py_None) {
5188 Py_DECREF(rep);
5189 break;
5190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005191 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193 }
5194 /* cache callback name lookup
5195 * (if not done yet, i.e. it's the first error) */
5196 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 if ((errors==NULL) || (!strcmp(errors, "strict")))
5198 *known_errorHandler = 1;
5199 else if (!strcmp(errors, "replace"))
5200 *known_errorHandler = 2;
5201 else if (!strcmp(errors, "ignore"))
5202 *known_errorHandler = 3;
5203 else if (!strcmp(errors, "xmlcharrefreplace"))
5204 *known_errorHandler = 4;
5205 else
5206 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005207 }
5208 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005209 case 1: /* strict */
5210 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5211 return -1;
5212 case 2: /* replace */
5213 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 x = charmapencode_output('?', mapping, res, respos);
5215 if (x==enc_EXCEPTION) {
5216 return -1;
5217 }
5218 else if (x==enc_FAILED) {
5219 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5220 return -1;
5221 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005222 }
5223 /* fall through */
5224 case 3: /* ignore */
5225 *inpos = collendpos;
5226 break;
5227 case 4: /* xmlcharrefreplace */
5228 /* generate replacement (temporarily (mis)uses p) */
5229 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 char buffer[2+29+1+1];
5231 char *cp;
5232 sprintf(buffer, "&#%d;", (int)p[collpos]);
5233 for (cp = buffer; *cp; ++cp) {
5234 x = charmapencode_output(*cp, mapping, res, respos);
5235 if (x==enc_EXCEPTION)
5236 return -1;
5237 else if (x==enc_FAILED) {
5238 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5239 return -1;
5240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005241 }
5242 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005243 *inpos = collendpos;
5244 break;
5245 default:
5246 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 encoding, reason, p, size, exceptionObject,
5248 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005249 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005251 if (PyBytes_Check(repunicode)) {
5252 /* Directly copy bytes result to output. */
5253 Py_ssize_t outsize = PyBytes_Size(*res);
5254 Py_ssize_t requiredsize;
5255 repsize = PyBytes_Size(repunicode);
5256 requiredsize = *respos + repsize;
5257 if (requiredsize > outsize)
5258 /* Make room for all additional bytes. */
5259 if (charmapencode_resize(res, respos, requiredsize)) {
5260 Py_DECREF(repunicode);
5261 return -1;
5262 }
5263 memcpy(PyBytes_AsString(*res) + *respos,
5264 PyBytes_AsString(repunicode), repsize);
5265 *respos += repsize;
5266 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005267 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005268 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005270 /* generate replacement */
5271 repsize = PyUnicode_GET_SIZE(repunicode);
5272 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 x = charmapencode_output(*uni2, mapping, res, respos);
5274 if (x==enc_EXCEPTION) {
5275 return -1;
5276 }
5277 else if (x==enc_FAILED) {
5278 Py_DECREF(repunicode);
5279 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5280 return -1;
5281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005282 }
5283 *inpos = newpos;
5284 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005285 }
5286 return 0;
5287}
5288
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 Py_ssize_t size,
5291 PyObject *mapping,
5292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005294 /* output object */
5295 PyObject *res = NULL;
5296 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005299 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005300 PyObject *errorHandler = NULL;
5301 PyObject *exc = NULL;
5302 /* the following variable is used for caching string comparisons
5303 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5304 * 3=ignore, 4=xmlcharrefreplace */
5305 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306
5307 /* Default to Latin-1 */
5308 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005311 /* allocate enough for a simple encoding without
5312 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005313 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005314 if (res == NULL)
5315 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005316 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005319 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 /* try to encode it */
5321 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5322 if (x==enc_EXCEPTION) /* error */
5323 goto onError;
5324 if (x==enc_FAILED) { /* unencodable character */
5325 if (charmap_encoding_error(p, size, &inpos, mapping,
5326 &exc,
5327 &known_errorHandler, &errorHandler, errors,
5328 &res, &respos)) {
5329 goto onError;
5330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005331 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 else
5333 /* done with this character => adjust input position */
5334 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005338 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005339 if (_PyBytes_Resize(&res, respos) < 0)
5340 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342 Py_XDECREF(exc);
5343 Py_XDECREF(errorHandler);
5344 return res;
5345
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 Py_XDECREF(res);
5348 Py_XDECREF(exc);
5349 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 return NULL;
5351}
5352
5353PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355{
5356 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 PyErr_BadArgument();
5358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 }
5360 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 PyUnicode_GET_SIZE(unicode),
5362 mapping,
5363 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364}
5365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366/* create or adjust a UnicodeTranslateError */
5367static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 const Py_UNICODE *unicode, Py_ssize_t size,
5369 Py_ssize_t startpos, Py_ssize_t endpos,
5370 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005373 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 }
5376 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5378 goto onError;
5379 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5380 goto onError;
5381 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5382 goto onError;
5383 return;
5384 onError:
5385 Py_DECREF(*exceptionObject);
5386 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 }
5388}
5389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005390/* raises a UnicodeTranslateError */
5391static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 const Py_UNICODE *unicode, Py_ssize_t size,
5393 Py_ssize_t startpos, Py_ssize_t endpos,
5394 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395{
5396 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005400}
5401
5402/* error handling callback helper:
5403 build arguments, call the callback and check the arguments,
5404 put the result into newpos and return the replacement string, which
5405 has to be freed by the caller */
5406static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 PyObject **errorHandler,
5408 const char *reason,
5409 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5410 Py_ssize_t startpos, Py_ssize_t endpos,
5411 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005412{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005413 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005414
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005415 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 PyObject *restuple;
5417 PyObject *resunicode;
5418
5419 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005421 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005423 }
5424
5425 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005429
5430 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005435 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 Py_DECREF(restuple);
5437 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 }
5439 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 &resunicode, &i_newpos)) {
5441 Py_DECREF(restuple);
5442 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005443 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005444 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005446 else
5447 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005448 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5450 Py_DECREF(restuple);
5451 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005452 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453 Py_INCREF(resunicode);
5454 Py_DECREF(restuple);
5455 return resunicode;
5456}
5457
5458/* Lookup the character ch in the mapping and put the result in result,
5459 which must be decrefed by the caller.
5460 Return 0 on success, -1 on error */
5461static
5462int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5463{
Christian Heimes217cfd12007-12-02 14:31:20 +00005464 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005465 PyObject *x;
5466
5467 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005469 x = PyObject_GetItem(mapping, w);
5470 Py_DECREF(w);
5471 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5473 /* No mapping found means: use 1:1 mapping. */
5474 PyErr_Clear();
5475 *result = NULL;
5476 return 0;
5477 } else
5478 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 }
5480 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 *result = x;
5482 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005483 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005484 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 long value = PyLong_AS_LONG(x);
5486 long max = PyUnicode_GetMax();
5487 if (value < 0 || value > max) {
5488 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005489 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 Py_DECREF(x);
5491 return -1;
5492 }
5493 *result = x;
5494 return 0;
5495 }
5496 else if (PyUnicode_Check(x)) {
5497 *result = x;
5498 return 0;
5499 }
5500 else {
5501 /* wrong return value */
5502 PyErr_SetString(PyExc_TypeError,
5503 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005504 Py_DECREF(x);
5505 return -1;
5506 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507}
5508/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 if not reallocate and adjust various state variables.
5510 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511static
Walter Dörwald4894c302003-10-24 14:25:28 +00005512int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005515 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005516 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 /* remember old output position */
5518 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5519 /* exponentially overallocate to minimize reallocations */
5520 if (requiredsize < 2 * oldsize)
5521 requiredsize = 2 * oldsize;
5522 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5523 return -1;
5524 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005525 }
5526 return 0;
5527}
5528/* lookup the character, put the result in the output string and adjust
5529 various state variables. Return a new reference to the object that
5530 was put in the output buffer in *result, or Py_None, if the mapping was
5531 undefined (in which case no character was written).
5532 The called must decref result.
5533 Return 0 on success, -1 on error. */
5534static
Walter Dörwald4894c302003-10-24 14:25:28 +00005535int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5537 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538{
Walter Dörwald4894c302003-10-24 14:25:28 +00005539 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005541 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 /* not found => default to 1:1 mapping */
5543 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 }
5545 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005547 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 /* no overflow check, because we know that the space is enough */
5549 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 }
5551 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5553 if (repsize==1) {
5554 /* no overflow check, because we know that the space is enough */
5555 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5556 }
5557 else if (repsize!=0) {
5558 /* more than one character */
5559 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5560 (insize - (curinp-startinp)) +
5561 repsize - 1;
5562 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5563 return -1;
5564 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5565 *outp += repsize;
5566 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 }
5568 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 return 0;
5571}
5572
5573PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 Py_ssize_t size,
5575 PyObject *mapping,
5576 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 /* output object */
5579 PyObject *res = NULL;
5580 /* pointers to the beginning and end+1 of input */
5581 const Py_UNICODE *startp = p;
5582 const Py_UNICODE *endp = p + size;
5583 /* pointer into the output */
5584 Py_UNICODE *str;
5585 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005586 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587 char *reason = "character maps to <undefined>";
5588 PyObject *errorHandler = NULL;
5589 PyObject *exc = NULL;
5590 /* the following variable is used for caching string comparisons
5591 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5592 * 3=ignore, 4=xmlcharrefreplace */
5593 int known_errorHandler = -1;
5594
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 PyErr_BadArgument();
5597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005599
5600 /* allocate enough for a simple 1:1 translation without
5601 replacements, if we need more, we'll resize */
5602 res = PyUnicode_FromUnicode(NULL, size);
5603 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005609 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 /* try to encode it */
5611 PyObject *x = NULL;
5612 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5613 Py_XDECREF(x);
5614 goto onError;
5615 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005616 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 if (x!=Py_None) /* it worked => adjust input pointer */
5618 ++p;
5619 else { /* untranslatable character */
5620 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5621 Py_ssize_t repsize;
5622 Py_ssize_t newpos;
5623 Py_UNICODE *uni2;
5624 /* startpos for collecting untranslatable chars */
5625 const Py_UNICODE *collstart = p;
5626 const Py_UNICODE *collend = p+1;
5627 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 /* find all untranslatable characters */
5630 while (collend < endp) {
5631 if (charmaptranslate_lookup(*collend, mapping, &x))
5632 goto onError;
5633 Py_XDECREF(x);
5634 if (x!=Py_None)
5635 break;
5636 ++collend;
5637 }
5638 /* cache callback name lookup
5639 * (if not done yet, i.e. it's the first error) */
5640 if (known_errorHandler==-1) {
5641 if ((errors==NULL) || (!strcmp(errors, "strict")))
5642 known_errorHandler = 1;
5643 else if (!strcmp(errors, "replace"))
5644 known_errorHandler = 2;
5645 else if (!strcmp(errors, "ignore"))
5646 known_errorHandler = 3;
5647 else if (!strcmp(errors, "xmlcharrefreplace"))
5648 known_errorHandler = 4;
5649 else
5650 known_errorHandler = 0;
5651 }
5652 switch (known_errorHandler) {
5653 case 1: /* strict */
5654 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005655 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 case 2: /* replace */
5657 /* No need to check for space, this is a 1:1 replacement */
5658 for (coll = collstart; coll<collend; ++coll)
5659 *str++ = '?';
5660 /* fall through */
5661 case 3: /* ignore */
5662 p = collend;
5663 break;
5664 case 4: /* xmlcharrefreplace */
5665 /* generate replacement (temporarily (mis)uses p) */
5666 for (p = collstart; p < collend; ++p) {
5667 char buffer[2+29+1+1];
5668 char *cp;
5669 sprintf(buffer, "&#%d;", (int)*p);
5670 if (charmaptranslate_makespace(&res, &str,
5671 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5672 goto onError;
5673 for (cp = buffer; *cp; ++cp)
5674 *str++ = *cp;
5675 }
5676 p = collend;
5677 break;
5678 default:
5679 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5680 reason, startp, size, &exc,
5681 collstart-startp, collend-startp, &newpos);
5682 if (repunicode == NULL)
5683 goto onError;
5684 /* generate replacement */
5685 repsize = PyUnicode_GET_SIZE(repunicode);
5686 if (charmaptranslate_makespace(&res, &str,
5687 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5688 Py_DECREF(repunicode);
5689 goto onError;
5690 }
5691 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5692 *str++ = *uni2;
5693 p = startp + newpos;
5694 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005696 }
5697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 /* Resize if we allocated to much */
5699 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005700 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 if (PyUnicode_Resize(&res, respos) < 0)
5702 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 }
5704 Py_XDECREF(exc);
5705 Py_XDECREF(errorHandler);
5706 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 Py_XDECREF(res);
5710 Py_XDECREF(exc);
5711 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 return NULL;
5713}
5714
5715PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 PyObject *mapping,
5717 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718{
5719 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005720
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 str = PyUnicode_FromObject(str);
5722 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 PyUnicode_GET_SIZE(str),
5726 mapping,
5727 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 Py_DECREF(str);
5729 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005730
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 Py_XDECREF(str);
5733 return NULL;
5734}
Tim Petersced69f82003-09-16 20:30:58 +00005735
Guido van Rossum9e896b32000-04-05 20:11:21 +00005736/* --- Decimal Encoder ---------------------------------------------------- */
5737
5738int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 Py_ssize_t length,
5740 char *output,
5741 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005742{
5743 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 PyObject *errorHandler = NULL;
5745 PyObject *exc = NULL;
5746 const char *encoding = "decimal";
5747 const char *reason = "invalid decimal Unicode string";
5748 /* the following variable is used for caching string comparisons
5749 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5750 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005751
5752 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 PyErr_BadArgument();
5754 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005755 }
5756
5757 p = s;
5758 end = s + length;
5759 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 register Py_UNICODE ch = *p;
5761 int decimal;
5762 PyObject *repunicode;
5763 Py_ssize_t repsize;
5764 Py_ssize_t newpos;
5765 Py_UNICODE *uni2;
5766 Py_UNICODE *collstart;
5767 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005768
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005770 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 ++p;
5772 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005773 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 decimal = Py_UNICODE_TODECIMAL(ch);
5775 if (decimal >= 0) {
5776 *output++ = '0' + decimal;
5777 ++p;
5778 continue;
5779 }
5780 if (0 < ch && ch < 256) {
5781 *output++ = (char)ch;
5782 ++p;
5783 continue;
5784 }
5785 /* All other characters are considered unencodable */
5786 collstart = p;
5787 collend = p+1;
5788 while (collend < end) {
5789 if ((0 < *collend && *collend < 256) ||
5790 !Py_UNICODE_ISSPACE(*collend) ||
5791 Py_UNICODE_TODECIMAL(*collend))
5792 break;
5793 }
5794 /* cache callback name lookup
5795 * (if not done yet, i.e. it's the first error) */
5796 if (known_errorHandler==-1) {
5797 if ((errors==NULL) || (!strcmp(errors, "strict")))
5798 known_errorHandler = 1;
5799 else if (!strcmp(errors, "replace"))
5800 known_errorHandler = 2;
5801 else if (!strcmp(errors, "ignore"))
5802 known_errorHandler = 3;
5803 else if (!strcmp(errors, "xmlcharrefreplace"))
5804 known_errorHandler = 4;
5805 else
5806 known_errorHandler = 0;
5807 }
5808 switch (known_errorHandler) {
5809 case 1: /* strict */
5810 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5811 goto onError;
5812 case 2: /* replace */
5813 for (p = collstart; p < collend; ++p)
5814 *output++ = '?';
5815 /* fall through */
5816 case 3: /* ignore */
5817 p = collend;
5818 break;
5819 case 4: /* xmlcharrefreplace */
5820 /* generate replacement (temporarily (mis)uses p) */
5821 for (p = collstart; p < collend; ++p)
5822 output += sprintf(output, "&#%d;", (int)*p);
5823 p = collend;
5824 break;
5825 default:
5826 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5827 encoding, reason, s, length, &exc,
5828 collstart-s, collend-s, &newpos);
5829 if (repunicode == NULL)
5830 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005831 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00005832 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005833 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
5834 Py_DECREF(repunicode);
5835 goto onError;
5836 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 /* generate replacement */
5838 repsize = PyUnicode_GET_SIZE(repunicode);
5839 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5840 Py_UNICODE ch = *uni2;
5841 if (Py_UNICODE_ISSPACE(ch))
5842 *output++ = ' ';
5843 else {
5844 decimal = Py_UNICODE_TODECIMAL(ch);
5845 if (decimal >= 0)
5846 *output++ = '0' + decimal;
5847 else if (0 < ch && ch < 256)
5848 *output++ = (char)ch;
5849 else {
5850 Py_DECREF(repunicode);
5851 raise_encode_exception(&exc, encoding,
5852 s, length, collstart-s, collend-s, reason);
5853 goto onError;
5854 }
5855 }
5856 }
5857 p = s + newpos;
5858 Py_DECREF(repunicode);
5859 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005860 }
5861 /* 0-terminate the output string */
5862 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 Py_XDECREF(exc);
5864 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005865 return 0;
5866
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 Py_XDECREF(exc);
5869 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005870 return -1;
5871}
5872
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873/* --- Helpers ------------------------------------------------------------ */
5874
Eric Smith8c663262007-08-25 02:26:07 +00005875#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005876#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005877
Thomas Wouters477c8d52006-05-27 19:21:47 +00005878#include "stringlib/count.h"
5879#include "stringlib/find.h"
5880#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005881#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005882
Eric Smith5807c412008-05-11 21:00:57 +00005883#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00005884#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00005885#include "stringlib/localeutil.h"
5886
/* Helper macro to fix up start/end slice values in place.
 *
 * `end` is clamped to `len`; a negative `end` or `start` is taken relative
 * to `len` and then clamped at 0.  `start` is NOT clamped to `len`, so a
 * caller may still see start > end afterwards.
 *
 * `start` and `end` must be modifiable lvalues.  Every argument is
 * evaluated more than once, so none of them may have side effects.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement and stays correct inside an unbraced if/else.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00005901
Martin v. Löwis18e16552006-02-15 17:27:45 +00005902Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005903 PyObject *substr,
5904 Py_ssize_t start,
5905 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005907 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005908 PyUnicodeObject* str_obj;
5909 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005910
Thomas Wouters477c8d52006-05-27 19:21:47 +00005911 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5912 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005914 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5915 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 Py_DECREF(str_obj);
5917 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 }
Tim Petersced69f82003-09-16 20:30:58 +00005919
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005920 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005921 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005922 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5923 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00005924 );
5925
5926 Py_DECREF(sub_obj);
5927 Py_DECREF(str_obj);
5928
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 return result;
5930}
5931
Martin v. Löwis18e16552006-02-15 17:27:45 +00005932Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005933 PyObject *sub,
5934 Py_ssize_t start,
5935 Py_ssize_t end,
5936 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005938 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005939
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005941 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005943 sub = PyUnicode_FromObject(sub);
5944 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 Py_DECREF(str);
5946 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 }
Tim Petersced69f82003-09-16 20:30:58 +00005948
Thomas Wouters477c8d52006-05-27 19:21:47 +00005949 if (direction > 0)
5950 result = stringlib_find_slice(
5951 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5952 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5953 start, end
5954 );
5955 else
5956 result = stringlib_rfind_slice(
5957 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5958 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5959 start, end
5960 );
5961
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005963 Py_DECREF(sub);
5964
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 return result;
5966}
5967
Tim Petersced69f82003-09-16 20:30:58 +00005968static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 PyUnicodeObject *substring,
5971 Py_ssize_t start,
5972 Py_ssize_t end,
5973 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 if (substring->length == 0)
5976 return 1;
5977
Antoine Pitrouf2c54842010-01-13 08:07:53 +00005978 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 end -= substring->length;
5980 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982
5983 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 if (Py_UNICODE_MATCH(self, end, substring))
5985 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 } else {
5987 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 }
5990
5991 return 0;
5992}
5993
Martin v. Löwis18e16552006-02-15 17:27:45 +00005994Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 PyObject *substr,
5996 Py_ssize_t start,
5997 Py_ssize_t end,
5998 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006000 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006001
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 str = PyUnicode_FromObject(str);
6003 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 substr = PyUnicode_FromObject(substr);
6006 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 Py_DECREF(str);
6008 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 }
Tim Petersced69f82003-09-16 20:30:58 +00006010
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 (PyUnicodeObject *)substr,
6013 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 Py_DECREF(str);
6015 Py_DECREF(substr);
6016 return result;
6017}
6018
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019/* Apply fixfct filter to the Unicode object self and return a
6020 reference to the modified object */
6021
Tim Petersced69f82003-09-16 20:30:58 +00006022static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025{
6026
6027 PyUnicodeObject *u;
6028
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006029 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006032
6033 Py_UNICODE_COPY(u->str, self->str, self->length);
6034
Tim Peters7a29bd52001-09-12 03:03:31 +00006035 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 /* fixfct should return TRUE if it modified the buffer. If
6037 FALSE, return a reference to the original buffer instead
6038 (to save space, not time) */
6039 Py_INCREF(self);
6040 Py_DECREF(u);
6041 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 }
6043 return (PyObject*) u;
6044}
6045
Tim Petersced69f82003-09-16 20:30:58 +00006046static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047int fixupper(PyUnicodeObject *self)
6048{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006049 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 Py_UNICODE *s = self->str;
6051 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006052
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006055
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 ch = Py_UNICODE_TOUPPER(*s);
6057 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 *s = ch;
6060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 s++;
6062 }
6063
6064 return status;
6065}
6066
Tim Petersced69f82003-09-16 20:30:58 +00006067static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068int fixlower(PyUnicodeObject *self)
6069{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006070 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 Py_UNICODE *s = self->str;
6072 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006073
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006076
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 ch = Py_UNICODE_TOLOWER(*s);
6078 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 *s = ch;
6081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 s++;
6083 }
6084
6085 return status;
6086}
6087
Tim Petersced69f82003-09-16 20:30:58 +00006088static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089int fixswapcase(PyUnicodeObject *self)
6090{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006091 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 Py_UNICODE *s = self->str;
6093 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006094
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 while (len-- > 0) {
6096 if (Py_UNICODE_ISUPPER(*s)) {
6097 *s = Py_UNICODE_TOLOWER(*s);
6098 status = 1;
6099 } else if (Py_UNICODE_ISLOWER(*s)) {
6100 *s = Py_UNICODE_TOUPPER(*s);
6101 status = 1;
6102 }
6103 s++;
6104 }
6105
6106 return status;
6107}
6108
Tim Petersced69f82003-09-16 20:30:58 +00006109static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110int fixcapitalize(PyUnicodeObject *self)
6111{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006112 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006113 Py_UNICODE *s = self->str;
6114 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006115
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006116 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006118 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 *s = Py_UNICODE_TOUPPER(*s);
6120 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006122 s++;
6123 while (--len > 0) {
6124 if (Py_UNICODE_ISUPPER(*s)) {
6125 *s = Py_UNICODE_TOLOWER(*s);
6126 status = 1;
6127 }
6128 s++;
6129 }
6130 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131}
6132
6133static
6134int fixtitle(PyUnicodeObject *self)
6135{
6136 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6137 register Py_UNICODE *e;
6138 int previous_is_cased;
6139
6140 /* Shortcut for single character strings */
6141 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6143 if (*p != ch) {
6144 *p = ch;
6145 return 1;
6146 }
6147 else
6148 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Tim Petersced69f82003-09-16 20:30:58 +00006150
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 e = p + PyUnicode_GET_SIZE(self);
6152 previous_is_cased = 0;
6153 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006155
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 if (previous_is_cased)
6157 *p = Py_UNICODE_TOLOWER(ch);
6158 else
6159 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006160
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 if (Py_UNICODE_ISLOWER(ch) ||
6162 Py_UNICODE_ISUPPER(ch) ||
6163 Py_UNICODE_ISTITLE(ch))
6164 previous_is_cased = 1;
6165 else
6166 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 }
6168 return 1;
6169}
6170
Tim Peters8ce9f162004-08-27 01:49:32 +00006171PyObject *
6172PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
Skip Montanaro6543b452004-09-16 03:28:13 +00006174 const Py_UNICODE blank = ' ';
6175 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006176 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006177 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006178 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6179 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006180 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6181 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006182 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006183 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
Tim Peters05eba1f2004-08-27 21:32:02 +00006185 fseq = PySequence_Fast(seq, "");
6186 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006187 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006188 }
6189
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006190 /* NOTE: the following code can't call back into Python code,
6191 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006192 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006193
Tim Peters05eba1f2004-08-27 21:32:02 +00006194 seqlen = PySequence_Fast_GET_SIZE(fseq);
6195 /* If empty sequence, return u"". */
6196 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006197 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6198 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006199 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006200 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006201 /* If singleton sequence with an exact Unicode, return that. */
6202 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 item = items[0];
6204 if (PyUnicode_CheckExact(item)) {
6205 Py_INCREF(item);
6206 res = (PyUnicodeObject *)item;
6207 goto Done;
6208 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006209 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006210 else {
6211 /* Set up sep and seplen */
6212 if (separator == NULL) {
6213 sep = &blank;
6214 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006215 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006216 else {
6217 if (!PyUnicode_Check(separator)) {
6218 PyErr_Format(PyExc_TypeError,
6219 "separator: expected str instance,"
6220 " %.80s found",
6221 Py_TYPE(separator)->tp_name);
6222 goto onError;
6223 }
6224 sep = PyUnicode_AS_UNICODE(separator);
6225 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006226 }
6227 }
6228
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006229 /* There are at least two things to join, or else we have a subclass
6230 * of str in the sequence.
6231 * Do a pre-pass to figure out the total amount of space we'll
6232 * need (sz), and see whether all argument are strings.
6233 */
6234 sz = 0;
6235 for (i = 0; i < seqlen; i++) {
6236 const Py_ssize_t old_sz = sz;
6237 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 if (!PyUnicode_Check(item)) {
6239 PyErr_Format(PyExc_TypeError,
6240 "sequence item %zd: expected str instance,"
6241 " %.80s found",
6242 i, Py_TYPE(item)->tp_name);
6243 goto onError;
6244 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006245 sz += PyUnicode_GET_SIZE(item);
6246 if (i != 0)
6247 sz += seplen;
6248 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6249 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006251 goto onError;
6252 }
6253 }
Tim Petersced69f82003-09-16 20:30:58 +00006254
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006255 res = _PyUnicode_New(sz);
6256 if (res == NULL)
6257 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006258
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006259 /* Catenate everything. */
6260 res_p = PyUnicode_AS_UNICODE(res);
6261 for (i = 0; i < seqlen; ++i) {
6262 Py_ssize_t itemlen;
6263 item = items[i];
6264 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 /* Copy item, and maybe the separator. */
6266 if (i) {
6267 Py_UNICODE_COPY(res_p, sep, seplen);
6268 res_p += seplen;
6269 }
6270 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6271 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006272 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006273
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006275 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 return (PyObject *)res;
6277
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006279 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006280 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 return NULL;
6282}
6283
Tim Petersced69f82003-09-16 20:30:58 +00006284static
6285PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 Py_ssize_t left,
6287 Py_ssize_t right,
6288 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289{
6290 PyUnicodeObject *u;
6291
6292 if (left < 0)
6293 left = 0;
6294 if (right < 0)
6295 right = 0;
6296
Tim Peters7a29bd52001-09-12 03:03:31 +00006297 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 Py_INCREF(self);
6299 return self;
6300 }
6301
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006302 if (left > PY_SSIZE_T_MAX - self->length ||
6303 right > PY_SSIZE_T_MAX - (left + self->length)) {
6304 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6305 return NULL;
6306 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 u = _PyUnicode_New(left + self->length + right);
6308 if (u) {
6309 if (left)
6310 Py_UNICODE_FILL(u->str, fill, left);
6311 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6312 if (right)
6313 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6314 }
6315
6316 return u;
6317}
6318
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006319PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322
6323 string = PyUnicode_FromObject(string);
6324 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006327 list = stringlib_splitlines(
6328 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6329 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330
6331 Py_DECREF(string);
6332 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333}
6334
Tim Petersced69f82003-09-16 20:30:58 +00006335static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 PyUnicodeObject *substring,
6338 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006341 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006344 return stringlib_split_whitespace(
6345 (PyObject*) self, self->str, self->length, maxcount
6346 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006348 return stringlib_split(
6349 (PyObject*) self, self->str, self->length,
6350 substring->str, substring->length,
6351 maxcount
6352 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353}
6354
Tim Petersced69f82003-09-16 20:30:58 +00006355static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006356PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 PyUnicodeObject *substring,
6358 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006359{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006360 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006361 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006362
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006363 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006364 return stringlib_rsplit_whitespace(
6365 (PyObject*) self, self->str, self->length, maxcount
6366 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006367
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006368 return stringlib_rsplit(
6369 (PyObject*) self, self->str, self->length,
6370 substring->str, substring->length,
6371 maxcount
6372 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006373}
6374
6375static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 PyUnicodeObject *str1,
6378 PyUnicodeObject *str2,
6379 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380{
6381 PyUnicodeObject *u;
6382
6383 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006385 else if (maxcount == 0 || self->length == 0)
6386 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
Thomas Wouters477c8d52006-05-27 19:21:47 +00006388 if (str1->length == str2->length) {
6389 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006390 if (str1->length == 0)
6391 goto nothing;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006392 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006393 if (str1->length == 1) {
6394 /* replace characters */
6395 Py_UNICODE u1, u2;
6396 if (!findchar(self->str, self->length, str1->str[0]))
6397 goto nothing;
6398 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6399 if (!u)
6400 return NULL;
6401 Py_UNICODE_COPY(u->str, self->str, self->length);
6402 u1 = str1->str[0];
6403 u2 = str2->str[0];
6404 for (i = 0; i < u->length; i++)
6405 if (u->str[i] == u1) {
6406 if (--maxcount < 0)
6407 break;
6408 u->str[i] = u2;
6409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006411 i = stringlib_find(
6412 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006414 if (i < 0)
6415 goto nothing;
6416 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6417 if (!u)
6418 return NULL;
6419 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006420
6421 /* change everything in-place, starting with this one */
6422 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6423 i += str1->length;
6424
6425 while ( --maxcount > 0) {
6426 i = stringlib_find(self->str+i, self->length-i,
6427 str1->str, str1->length,
6428 i);
6429 if (i == -1)
6430 break;
6431 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6432 i += str1->length;
6433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006436
6437 Py_ssize_t n, i, j, e;
6438 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 Py_UNICODE *p;
6440
6441 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006442 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6443 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006444 if (n == 0)
6445 goto nothing;
6446 /* new_size = self->length + n * (str2->length - str1->length)); */
6447 delta = (str2->length - str1->length);
6448 if (delta == 0) {
6449 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006451 product = n * (str2->length - str1->length);
6452 if ((product / (str2->length - str1->length)) != n) {
6453 PyErr_SetString(PyExc_OverflowError,
6454 "replace string is too long");
6455 return NULL;
6456 }
6457 new_size = self->length + product;
6458 if (new_size < 0) {
6459 PyErr_SetString(PyExc_OverflowError,
6460 "replace string is too long");
6461 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 }
6463 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006464 u = _PyUnicode_New(new_size);
6465 if (!u)
6466 return NULL;
6467 i = 0;
6468 p = u->str;
6469 e = self->length - str1->length;
6470 if (str1->length > 0) {
6471 while (n-- > 0) {
6472 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006473 j = stringlib_find(self->str+i, self->length-i,
6474 str1->str, str1->length,
6475 i);
6476 if (j == -1)
6477 break;
6478 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006479 /* copy unchanged part [i:j] */
6480 Py_UNICODE_COPY(p, self->str+i, j-i);
6481 p += j - i;
6482 }
6483 /* copy substitution string */
6484 if (str2->length > 0) {
6485 Py_UNICODE_COPY(p, str2->str, str2->length);
6486 p += str2->length;
6487 }
6488 i = j + str1->length;
6489 }
6490 if (i < self->length)
6491 /* copy tail [i:] */
6492 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6493 } else {
6494 /* interleave */
6495 while (n > 0) {
6496 Py_UNICODE_COPY(p, str2->str, str2->length);
6497 p += str2->length;
6498 if (--n <= 0)
6499 break;
6500 *p++ = self->str[i++];
6501 }
6502 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006506
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006508 /* nothing to replace; return original string (when possible) */
6509 if (PyUnicode_CheckExact(self)) {
6510 Py_INCREF(self);
6511 return (PyObject *) self;
6512 }
6513 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514}
6515
6516/* --- Unicode Object Methods --------------------------------------------- */
6517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006518PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520\n\
6521Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006522characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
6524static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006525unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 return fixup(self, fixtitle);
6528}
6529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006530PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532\n\
6533Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006534have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535
6536static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006537unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 return fixup(self, fixcapitalize);
6540}
6541
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, runs fixcapitalize over every word and joins
   the words again with the default separator of PyUnicode_Join().
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6579
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006580/* Argument converter. Coerces to a single unicode character */
6581
6582static int
6583convert_uc(PyObject *obj, void *addr)
6584{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006585 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6586 PyObject *uniobj;
6587 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006588
Benjamin Peterson14339b62009-01-31 16:36:08 +00006589 uniobj = PyUnicode_FromObject(obj);
6590 if (uniobj == NULL) {
6591 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006593 return 0;
6594 }
6595 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6596 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006598 Py_DECREF(uniobj);
6599 return 0;
6600 }
6601 unistr = PyUnicode_AS_UNICODE(uniobj);
6602 *fillcharloc = unistr[0];
6603 Py_DECREF(uniobj);
6604 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006605}
6606
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006607PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006610Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006611done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
6613static PyObject *
6614unicode_center(PyUnicodeObject *self, PyObject *args)
6615{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006616 Py_ssize_t marg, left;
6617 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006618 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
Thomas Woutersde017742006-02-16 19:34:37 +00006620 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 return NULL;
6622
Tim Peters7a29bd52001-09-12 03:03:31 +00006623 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 Py_INCREF(self);
6625 return (PyObject*) self;
6626 }
6627
6628 marg = width - self->length;
6629 left = marg / 2 + (marg & width & 1);
6630
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006631 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632}
6633
Marc-André Lemburge5034372000-08-08 08:04:29 +00006634#if 0
6635
6636/* This code should go into some future Unicode collation support
6637 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006638 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006639
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006640/* speedy UTF-16 code point order comparison */
6641/* gleaned from: */
6642/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6643
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006644static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006645{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006646 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006647 0, 0, 0, 0, 0, 0, 0, 0,
6648 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006649 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006650};
6651
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652static int
6653unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6654{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006655 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006656
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 Py_UNICODE *s1 = str1->str;
6658 Py_UNICODE *s2 = str2->str;
6659
6660 len1 = str1->length;
6661 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006662
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006664 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006665
6666 c1 = *s1++;
6667 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006668
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 if (c1 > (1<<11) * 26)
6670 c1 += utf16Fixup[c1>>11];
6671 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006672 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006673 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006674
6675 if (c1 != c2)
6676 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006677
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006678 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 }
6680
6681 return (len1 < len2) ? -1 : (len1 != len2);
6682}
6683
Marc-André Lemburge5034372000-08-08 08:04:29 +00006684#else
6685
6686static int
6687unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6688{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006689 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006690
6691 Py_UNICODE *s1 = str1->str;
6692 Py_UNICODE *s2 = str2->str;
6693
6694 len1 = str1->length;
6695 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006696
Marc-André Lemburge5034372000-08-08 08:04:29 +00006697 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006698 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006699
Fredrik Lundh45714e92001-06-26 16:39:36 +00006700 c1 = *s1++;
6701 c2 = *s2++;
6702
6703 if (c1 != c2)
6704 return (c1 < c2) ? -1 : 1;
6705
Marc-André Lemburge5034372000-08-08 08:04:29 +00006706 len1--; len2--;
6707 }
6708
6709 return (len1 < len2) ? -1 : (len1 != len2);
6710}
6711
6712#endif
6713
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006717 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6718 return unicode_compare((PyUnicodeObject *)left,
6719 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006720 PyErr_Format(PyExc_TypeError,
6721 "Can't compare %.100s and %.100s",
6722 left->ob_type->tp_name,
6723 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 return -1;
6725}
6726
Martin v. Löwis5b222132007-06-10 09:51:05 +00006727int
6728PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6729{
6730 int i;
6731 Py_UNICODE *id;
6732 assert(PyUnicode_Check(uni));
6733 id = PyUnicode_AS_UNICODE(uni);
6734 /* Compare Unicode string and source character set string */
6735 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 if (id[i] != str[i])
6737 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006738 /* This check keeps Python strings that end in '\0' from comparing equal
6739 to C strings identical up to that point. */
6740 if (PyUnicode_GET_SIZE(uni) != i)
6741 /* We'll say the Python string is longer. */
6742 return 1;
Martin v. Löwis5b222132007-06-10 09:51:05 +00006743 if (id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006745 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006747 return 0;
6748}
6749
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006750
Benjamin Peterson29060642009-01-31 22:14:21 +00006751#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006752 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006753
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006754PyObject *PyUnicode_RichCompare(PyObject *left,
6755 PyObject *right,
6756 int op)
6757{
6758 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006759
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006760 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6761 PyObject *v;
6762 if (((PyUnicodeObject *) left)->length !=
6763 ((PyUnicodeObject *) right)->length) {
6764 if (op == Py_EQ) {
6765 Py_INCREF(Py_False);
6766 return Py_False;
6767 }
6768 if (op == Py_NE) {
6769 Py_INCREF(Py_True);
6770 return Py_True;
6771 }
6772 }
6773 if (left == right)
6774 result = 0;
6775 else
6776 result = unicode_compare((PyUnicodeObject *)left,
6777 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006778
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006779 /* Convert the return value to a Boolean */
6780 switch (op) {
6781 case Py_EQ:
6782 v = TEST_COND(result == 0);
6783 break;
6784 case Py_NE:
6785 v = TEST_COND(result != 0);
6786 break;
6787 case Py_LE:
6788 v = TEST_COND(result <= 0);
6789 break;
6790 case Py_GE:
6791 v = TEST_COND(result >= 0);
6792 break;
6793 case Py_LT:
6794 v = TEST_COND(result == -1);
6795 break;
6796 case Py_GT:
6797 v = TEST_COND(result == 1);
6798 break;
6799 default:
6800 PyErr_BadArgument();
6801 return NULL;
6802 }
6803 Py_INCREF(v);
6804 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006805 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006806
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006807 Py_INCREF(Py_NotImplemented);
6808 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006809}
6810
Guido van Rossum403d68b2000-03-13 15:55:09 +00006811int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006813{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006814 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006815 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006816
6817 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006818 sub = PyUnicode_FromObject(element);
6819 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 PyErr_Format(PyExc_TypeError,
6821 "'in <string>' requires string as left operand, not %s",
6822 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006823 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006824 }
6825
Thomas Wouters477c8d52006-05-27 19:21:47 +00006826 str = PyUnicode_FromObject(container);
6827 if (!str) {
6828 Py_DECREF(sub);
6829 return -1;
6830 }
6831
6832 result = stringlib_contains_obj(str, sub);
6833
6834 Py_DECREF(str);
6835 Py_DECREF(sub);
6836
Guido van Rossum403d68b2000-03-13 15:55:09 +00006837 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006838}
6839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840/* Concat to string or Unicode object giving a new Unicode object. */
6841
6842PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844{
6845 PyUnicodeObject *u = NULL, *v = NULL, *w;
6846
6847 /* Coerce the two arguments */
6848 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6849 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6852 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
6855 /* Shortcuts */
6856 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 Py_DECREF(v);
6858 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 }
6860 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 Py_DECREF(u);
6862 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 }
6864
6865 /* Concat the two Unicode strings */
6866 w = _PyUnicode_New(u->length + v->length);
6867 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 Py_UNICODE_COPY(w->str, u->str, u->length);
6870 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6871
6872 Py_DECREF(u);
6873 Py_DECREF(v);
6874 return (PyObject *)w;
6875
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 Py_XDECREF(u);
6878 Py_XDECREF(v);
6879 return NULL;
6880}
6881
Walter Dörwald1ab83302007-05-18 17:15:44 +00006882void
6883PyUnicode_Append(PyObject **pleft, PyObject *right)
6884{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006885 PyObject *new;
6886 if (*pleft == NULL)
6887 return;
6888 if (right == NULL || !PyUnicode_Check(*pleft)) {
6889 Py_DECREF(*pleft);
6890 *pleft = NULL;
6891 return;
6892 }
6893 new = PyUnicode_Concat(*pleft, right);
6894 Py_DECREF(*pleft);
6895 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00006896}
6897
6898void
6899PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6900{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006901 PyUnicode_Append(pleft, right);
6902 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00006903}
6904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006905PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006908Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006909string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006910interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
6912static PyObject *
6913unicode_count(PyUnicodeObject *self, PyObject *args)
6914{
6915 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006916 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006917 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 PyObject *result;
6919
Guido van Rossumb8872e62000-05-09 14:14:27 +00006920 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 return NULL;
6923
6924 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006925 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006928
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006929 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00006930 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006931 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006932 substring->str, substring->length,
6933 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006934 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935
6936 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 return result;
6939}
6940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006941PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006944Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006945to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006946handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006947a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6948'xmlcharrefreplace' as well as any other name registered with\n\
6949codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950
6951static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00006952unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953{
Benjamin Peterson308d6372009-09-18 21:42:35 +00006954 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 char *encoding = NULL;
6956 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006957 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006958
Benjamin Peterson308d6372009-09-18 21:42:35 +00006959 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6960 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006962 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006963 if (v == NULL)
6964 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006965 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006966 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006967 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006968 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006969 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006970 Py_DECREF(v);
6971 return NULL;
6972 }
6973 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006974
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006976 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006977}
6978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006979PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981\n\
6982Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006983If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984
6985static PyObject*
6986unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6987{
6988 Py_UNICODE *e;
6989 Py_UNICODE *p;
6990 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006991 Py_UNICODE *qe;
6992 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 PyUnicodeObject *u;
6994 int tabsize = 8;
6995
6996 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998
Thomas Wouters7e474022000-07-16 12:04:32 +00006999 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007000 i = 0; /* chars up to and including most recent \n or \r */
7001 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7002 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 for (p = self->str; p < e; p++)
7004 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 if (tabsize > 0) {
7006 incr = tabsize - (j % tabsize); /* cannot overflow */
7007 if (j > PY_SSIZE_T_MAX - incr)
7008 goto overflow1;
7009 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007010 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 if (j > PY_SSIZE_T_MAX - 1)
7014 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015 j++;
7016 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 if (i > PY_SSIZE_T_MAX - j)
7018 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007020 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 }
7022 }
7023
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007024 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007026
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 /* Second pass: create output string and fill it */
7028 u = _PyUnicode_New(i + j);
7029 if (!u)
7030 return NULL;
7031
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007032 j = 0; /* same as in first pass */
7033 q = u->str; /* next output char */
7034 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035
7036 for (p = self->str; p < e; p++)
7037 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 if (tabsize > 0) {
7039 i = tabsize - (j % tabsize);
7040 j += i;
7041 while (i--) {
7042 if (q >= qe)
7043 goto overflow2;
7044 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007045 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007046 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007047 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 else {
7049 if (q >= qe)
7050 goto overflow2;
7051 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007052 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 if (*p == '\n' || *p == '\r')
7054 j = 0;
7055 }
7056
7057 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007058
7059 overflow2:
7060 Py_DECREF(u);
7061 overflow1:
7062 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7063 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007066PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068\n\
7069Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007070such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071arguments start and end are interpreted as in slice notation.\n\
7072\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007073Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074
7075static PyObject *
7076unicode_find(PyUnicodeObject *self, PyObject *args)
7077{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007078 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007079 Py_ssize_t start;
7080 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007081 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082
Christian Heimes9cd17752007-11-18 19:35:23 +00007083 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085
Thomas Wouters477c8d52006-05-27 19:21:47 +00007086 result = stringlib_find_slice(
7087 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7088 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7089 start, end
7090 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091
7092 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007093
Christian Heimes217cfd12007-12-02 14:31:20 +00007094 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095}
7096
7097static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007098unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099{
7100 if (index < 0 || index >= self->length) {
7101 PyErr_SetString(PyExc_IndexError, "string index out of range");
7102 return NULL;
7103 }
7104
7105 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7106}
7107
Guido van Rossumc2504932007-09-18 19:42:40 +00007108/* Believe it or not, this produces the same value for ASCII strings
7109 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007111unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112{
Guido van Rossumc2504932007-09-18 19:42:40 +00007113 Py_ssize_t len;
7114 Py_UNICODE *p;
7115 long x;
7116
7117 if (self->hash != -1)
7118 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007119 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007120 p = self->str;
7121 x = *p << 7;
7122 while (--len >= 0)
7123 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007124 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007125 if (x == -1)
7126 x = -2;
7127 self->hash = x;
7128 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129}
7130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007131PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007134Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135
7136static PyObject *
7137unicode_index(PyUnicodeObject *self, PyObject *args)
7138{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007139 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007140 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007141 Py_ssize_t start;
7142 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143
Christian Heimes9cd17752007-11-18 19:35:23 +00007144 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146
Thomas Wouters477c8d52006-05-27 19:21:47 +00007147 result = stringlib_find_slice(
7148 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7149 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7150 start, end
7151 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152
7153 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007154
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 if (result < 0) {
7156 PyErr_SetString(PyExc_ValueError, "substring not found");
7157 return NULL;
7158 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007159
Christian Heimes217cfd12007-12-02 14:31:20 +00007160 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161}
7162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007163PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007166Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007167at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168
7169static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007170unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171{
7172 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7173 register const Py_UNICODE *e;
7174 int cased;
7175
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 /* Shortcut for single character strings */
7177 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007180 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007181 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007183
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 e = p + PyUnicode_GET_SIZE(self);
7185 cased = 0;
7186 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007188
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7190 return PyBool_FromLong(0);
7191 else if (!cased && Py_UNICODE_ISLOWER(ch))
7192 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007194 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195}
7196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007197PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007200Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007201at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202
7203static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007204unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205{
7206 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7207 register const Py_UNICODE *e;
7208 int cased;
7209
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 /* Shortcut for single character strings */
7211 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007214 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007215 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007217
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 e = p + PyUnicode_GET_SIZE(self);
7219 cased = 0;
7220 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007222
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7224 return PyBool_FromLong(0);
7225 else if (!cased && Py_UNICODE_ISUPPER(ch))
7226 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007228 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229}
7230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007231PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007234Return True if S is a titlecased string and there is at least one\n\
7235character in S, i.e. upper- and titlecase characters may only\n\
7236follow uncased characters and lowercase characters only cased ones.\n\
7237Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238
7239static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007240unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241{
7242 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7243 register const Py_UNICODE *e;
7244 int cased, previous_is_cased;
7245
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 /* Shortcut for single character strings */
7247 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7249 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007251 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007252 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007253 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007254
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 e = p + PyUnicode_GET_SIZE(self);
7256 cased = 0;
7257 previous_is_cased = 0;
7258 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007260
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7262 if (previous_is_cased)
7263 return PyBool_FromLong(0);
7264 previous_is_cased = 1;
7265 cased = 1;
7266 }
7267 else if (Py_UNICODE_ISLOWER(ch)) {
7268 if (!previous_is_cased)
7269 return PyBool_FromLong(0);
7270 previous_is_cased = 1;
7271 cased = 1;
7272 }
7273 else
7274 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007276 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277}
7278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007279PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007280 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007282Return True if all characters in S are whitespace\n\
7283and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
7285static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007286unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287{
7288 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7289 register const Py_UNICODE *e;
7290
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291 /* Shortcut for single character strings */
7292 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 Py_UNICODE_ISSPACE(*p))
7294 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007296 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007297 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007299
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300 e = p + PyUnicode_GET_SIZE(self);
7301 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007302 if (!Py_UNICODE_ISSPACE(*p))
7303 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007305 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306}
7307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007308PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007310\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007311Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007312and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007313
7314static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007315unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007316{
7317 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7318 register const Py_UNICODE *e;
7319
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007320 /* Shortcut for single character strings */
7321 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 Py_UNICODE_ISALPHA(*p))
7323 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007324
7325 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007326 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007328
7329 e = p + PyUnicode_GET_SIZE(self);
7330 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 if (!Py_UNICODE_ISALPHA(*p))
7332 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007333 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007334 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007335}
7336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007337PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007339\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007340Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007341and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007342
7343static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007344unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007345{
7346 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7347 register const Py_UNICODE *e;
7348
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007349 /* Shortcut for single character strings */
7350 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 Py_UNICODE_ISALNUM(*p))
7352 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007353
7354 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007355 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007357
7358 e = p + PyUnicode_GET_SIZE(self);
7359 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 if (!Py_UNICODE_ISALNUM(*p))
7361 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007362 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007363 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007364}
7365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007366PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007369Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007370False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371
7372static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007373unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374{
7375 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7376 register const Py_UNICODE *e;
7377
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378 /* Shortcut for single character strings */
7379 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 Py_UNICODE_ISDECIMAL(*p))
7381 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007383 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007384 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007386
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 e = p + PyUnicode_GET_SIZE(self);
7388 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 if (!Py_UNICODE_ISDECIMAL(*p))
7390 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007392 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393}
7394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007395PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007398Return True if all characters in S are digits\n\
7399and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
7401static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007402unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403{
7404 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7405 register const Py_UNICODE *e;
7406
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 /* Shortcut for single character strings */
7408 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 Py_UNICODE_ISDIGIT(*p))
7410 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007412 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007413 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007415
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416 e = p + PyUnicode_GET_SIZE(self);
7417 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 if (!Py_UNICODE_ISDIGIT(*p))
7419 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007421 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422}
7423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007424PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007427Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007428False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
7430static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007431unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432{
7433 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7434 register const Py_UNICODE *e;
7435
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 /* Shortcut for single character strings */
7437 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 Py_UNICODE_ISNUMERIC(*p))
7439 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007441 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007442 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007444
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 e = p + PyUnicode_GET_SIZE(self);
7446 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 if (!Py_UNICODE_ISNUMERIC(*p))
7448 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007450 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451}
7452
Martin v. Löwis47383402007-08-15 07:32:56 +00007453int
7454PyUnicode_IsIdentifier(PyObject *self)
7455{
7456 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7457 register const Py_UNICODE *e;
7458
7459 /* Special case for empty strings */
7460 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007462
7463 /* PEP 3131 says that the first character must be in
7464 XID_Start and subsequent characters in XID_Continue,
7465 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007466 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007467 letters, digits, underscore). However, given the current
7468 definition of XID_Start and XID_Continue, it is sufficient
7469 to check just for these, except that _ must be allowed
7470 as starting an identifier. */
7471 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7472 return 0;
7473
7474 e = p + PyUnicode_GET_SIZE(self);
7475 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 if (!_PyUnicode_IsXidContinue(*p))
7477 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007478 }
7479 return 1;
7480}
7481
7482PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007484\n\
7485Return True if S is a valid identifier according\n\
7486to the language definition.");
7487
7488static PyObject*
7489unicode_isidentifier(PyObject *self)
7490{
7491 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7492}
7493
Georg Brandl559e5d72008-06-11 18:37:52 +00007494PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007496\n\
7497Return True if all characters in S are considered\n\
7498printable in repr() or S is empty, False otherwise.");
7499
7500static PyObject*
7501unicode_isprintable(PyObject *self)
7502{
7503 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7504 register const Py_UNICODE *e;
7505
7506 /* Shortcut for single character strings */
7507 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7508 Py_RETURN_TRUE;
7509 }
7510
7511 e = p + PyUnicode_GET_SIZE(self);
7512 for (; p < e; p++) {
7513 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7514 Py_RETURN_FALSE;
7515 }
7516 }
7517 Py_RETURN_TRUE;
7518}
7519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007520PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007521 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522\n\
7523Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007524iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525
7526static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007527unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007529 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530}
7531
Martin v. Löwis18e16552006-02-15 17:27:45 +00007532static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533unicode_length(PyUnicodeObject *self)
7534{
7535 return self->length;
7536}
7537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007538PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007541Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007542done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
7544static PyObject *
7545unicode_ljust(PyUnicodeObject *self, PyObject *args)
7546{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007547 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007548 Py_UNICODE fillchar = ' ';
7549
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007550 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 return NULL;
7552
Tim Peters7a29bd52001-09-12 03:03:31 +00007553 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 Py_INCREF(self);
7555 return (PyObject*) self;
7556 }
7557
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007558 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559}
7560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007561PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007562 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007564Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565
7566static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007567unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 return fixup(self, fixlower);
7570}
7571
/* Selectors for the side(s) of the string a strip operation works on */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string minus its leading "|O:" */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7580
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007581/* externally visible for str.strip(unicode) */
7582PyObject *
7583_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7584{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007585 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7586 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7587 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7588 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7589 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007590
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007592
Benjamin Peterson14339b62009-01-31 16:36:08 +00007593 i = 0;
7594 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7596 i++;
7597 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007598 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007599
Benjamin Peterson14339b62009-01-31 16:36:08 +00007600 j = len;
7601 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 do {
7603 j--;
7604 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7605 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007606 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007607
Benjamin Peterson14339b62009-01-31 16:36:08 +00007608 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 Py_INCREF(self);
7610 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 }
7612 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007614}
7615
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616
7617static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007618do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007620 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7621 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007622
Benjamin Peterson14339b62009-01-31 16:36:08 +00007623 i = 0;
7624 if (striptype != RIGHTSTRIP) {
7625 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7626 i++;
7627 }
7628 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007629
Benjamin Peterson14339b62009-01-31 16:36:08 +00007630 j = len;
7631 if (striptype != LEFTSTRIP) {
7632 do {
7633 j--;
7634 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7635 j++;
7636 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007637
Benjamin Peterson14339b62009-01-31 16:36:08 +00007638 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7639 Py_INCREF(self);
7640 return (PyObject*)self;
7641 }
7642 else
7643 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644}
7645
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007646
7647static PyObject *
7648do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7649{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007650 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007651
Benjamin Peterson14339b62009-01-31 16:36:08 +00007652 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7653 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007654
Benjamin Peterson14339b62009-01-31 16:36:08 +00007655 if (sep != NULL && sep != Py_None) {
7656 if (PyUnicode_Check(sep))
7657 return _PyUnicode_XStrip(self, striptype, sep);
7658 else {
7659 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 "%s arg must be None or str",
7661 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007662 return NULL;
7663 }
7664 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007665
Benjamin Peterson14339b62009-01-31 16:36:08 +00007666 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007667}
7668
7669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007670PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007672\n\
7673Return a copy of the string S with leading and trailing\n\
7674whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007675If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007676
7677static PyObject *
7678unicode_strip(PyUnicodeObject *self, PyObject *args)
7679{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007680 if (PyTuple_GET_SIZE(args) == 0)
7681 return do_strip(self, BOTHSTRIP); /* Common case */
7682 else
7683 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007684}
7685
7686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007687PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007689\n\
7690Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007691If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007692
7693static PyObject *
7694unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7695{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007696 if (PyTuple_GET_SIZE(args) == 0)
7697 return do_strip(self, LEFTSTRIP); /* Common case */
7698 else
7699 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007700}
7701
7702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007703PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007705\n\
7706Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007707If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007708
7709static PyObject *
7710unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7711{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007712 if (PyTuple_GET_SIZE(args) == 0)
7713 return do_strip(self, RIGHTSTRIP); /* Common case */
7714 else
7715 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007716}
7717
7718
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007720unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721{
7722 PyUnicodeObject *u;
7723 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007724 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007725 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
Georg Brandl222de0f2009-04-12 12:01:50 +00007727 if (len < 1) {
7728 Py_INCREF(unicode_empty);
7729 return (PyObject *)unicode_empty;
7730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
Tim Peters7a29bd52001-09-12 03:03:31 +00007732 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 /* no repeat, return original string */
7734 Py_INCREF(str);
7735 return (PyObject*) str;
7736 }
Tim Peters8f422462000-09-09 06:13:41 +00007737
7738 /* ensure # of chars needed doesn't overflow int and # of bytes
7739 * needed doesn't overflow size_t
7740 */
7741 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007742 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007743 PyErr_SetString(PyExc_OverflowError,
7744 "repeated string is too long");
7745 return NULL;
7746 }
7747 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7748 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7749 PyErr_SetString(PyExc_OverflowError,
7750 "repeated string is too long");
7751 return NULL;
7752 }
7753 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 if (!u)
7755 return NULL;
7756
7757 p = u->str;
7758
Georg Brandl222de0f2009-04-12 12:01:50 +00007759 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007760 Py_UNICODE_FILL(p, str->str[0], len);
7761 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007762 Py_ssize_t done = str->length; /* number of characters copied this far */
7763 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007765 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007766 Py_UNICODE_COPY(p+done, p, n);
7767 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 }
7770
7771 return (PyObject*) u;
7772}
7773
7774PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 PyObject *subobj,
7776 PyObject *replobj,
7777 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778{
7779 PyObject *self;
7780 PyObject *str1;
7781 PyObject *str2;
7782 PyObject *result;
7783
7784 self = PyUnicode_FromObject(obj);
7785 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 str1 = PyUnicode_FromObject(subobj);
7788 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 Py_DECREF(self);
7790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791 }
7792 str2 = PyUnicode_FromObject(replobj);
7793 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 Py_DECREF(self);
7795 Py_DECREF(str1);
7796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 }
Tim Petersced69f82003-09-16 20:30:58 +00007798 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 (PyUnicodeObject *)str1,
7800 (PyUnicodeObject *)str2,
7801 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 Py_DECREF(self);
7803 Py_DECREF(str1);
7804 Py_DECREF(str2);
7805 return result;
7806}
7807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007808PyDoc_STRVAR(replace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 "S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810\n\
7811Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007812old replaced by new. If the optional argument count is\n\
7813given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814
7815static PyObject*
7816unicode_replace(PyUnicodeObject *self, PyObject *args)
7817{
7818 PyUnicodeObject *str1;
7819 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007820 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 PyObject *result;
7822
Martin v. Löwis18e16552006-02-15 17:27:45 +00007823 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824 return NULL;
7825 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7826 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007829 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 Py_DECREF(str1);
7831 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007832 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
7834 result = replace(self, str1, str2, maxcount);
7835
7836 Py_DECREF(str1);
7837 Py_DECREF(str2);
7838 return result;
7839}
7840
7841static
7842PyObject *unicode_repr(PyObject *unicode)
7843{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007844 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007845 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007846 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7847 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7848
7849 /* XXX(nnorwitz): rather than over-allocating, it would be
7850 better to choose a different scheme. Perhaps scan the
7851 first N-chars of the string and allocate based on that size.
7852 */
7853 /* Initial allocation is based on the longest-possible unichr
7854 escape.
7855
7856 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7857 unichr, so in this case it's the longest unichr escape. In
7858 narrow (UTF-16) builds this is five chars per source unichr
7859 since there are two unichrs in the surrogate pair, so in narrow
7860 (UTF-16) builds it's not the longest unichr escape.
7861
7862 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7863 so in the narrow (UTF-16) build case it's the longest unichr
7864 escape.
7865 */
7866
Walter Dörwald1ab83302007-05-18 17:15:44 +00007867 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00007869#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007871#else
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00007873#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007875 if (repr == NULL)
7876 return NULL;
7877
Walter Dörwald1ab83302007-05-18 17:15:44 +00007878 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007879
7880 /* Add quote */
7881 *p++ = (findchar(s, size, '\'') &&
7882 !findchar(s, size, '"')) ? '"' : '\'';
7883 while (size-- > 0) {
7884 Py_UNICODE ch = *s++;
7885
7886 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007887 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007888 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007889 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007890 continue;
7891 }
7892
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007894 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007895 *p++ = '\\';
7896 *p++ = 't';
7897 }
7898 else if (ch == '\n') {
7899 *p++ = '\\';
7900 *p++ = 'n';
7901 }
7902 else if (ch == '\r') {
7903 *p++ = '\\';
7904 *p++ = 'r';
7905 }
7906
7907 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007908 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007909 *p++ = '\\';
7910 *p++ = 'x';
7911 *p++ = hexdigits[(ch >> 4) & 0x000F];
7912 *p++ = hexdigits[ch & 0x000F];
7913 }
7914
Georg Brandl559e5d72008-06-11 18:37:52 +00007915 /* Copy ASCII characters as-is */
7916 else if (ch < 0x7F) {
7917 *p++ = ch;
7918 }
7919
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00007921 else {
7922 Py_UCS4 ucs = ch;
7923
7924#ifndef Py_UNICODE_WIDE
7925 Py_UNICODE ch2 = 0;
7926 /* Get code point from surrogate pair */
7927 if (size > 0) {
7928 ch2 = *s;
7929 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007931 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007933 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00007934 size--;
7935 }
7936 }
7937#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00007938 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00007939 (categories Z* and C* except ASCII space)
7940 */
7941 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7942 /* Map 8-bit characters to '\xhh' */
7943 if (ucs <= 0xff) {
7944 *p++ = '\\';
7945 *p++ = 'x';
7946 *p++ = hexdigits[(ch >> 4) & 0x000F];
7947 *p++ = hexdigits[ch & 0x000F];
7948 }
7949 /* Map 21-bit characters to '\U00xxxxxx' */
7950 else if (ucs >= 0x10000) {
7951 *p++ = '\\';
7952 *p++ = 'U';
7953 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7954 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7955 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7956 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7957 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7958 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7959 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7960 *p++ = hexdigits[ucs & 0x0000000F];
7961 }
7962 /* Map 16-bit characters to '\uxxxx' */
7963 else {
7964 *p++ = '\\';
7965 *p++ = 'u';
7966 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7967 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7968 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7969 *p++ = hexdigits[ucs & 0x000F];
7970 }
7971 }
7972 /* Copy characters as-is */
7973 else {
7974 *p++ = ch;
7975#ifndef Py_UNICODE_WIDE
7976 if (ucs >= 0x10000)
7977 *p++ = ch2;
7978#endif
7979 }
7980 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007981 }
7982 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007983 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007984
7985 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00007986 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007987 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988}
7989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007990PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992\n\
7993Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007994such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995arguments start and end are interpreted as in slice notation.\n\
7996\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007997Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998
7999static PyObject *
8000unicode_rfind(PyUnicodeObject *self, PyObject *args)
8001{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008002 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008003 Py_ssize_t start;
8004 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008005 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006
Christian Heimes9cd17752007-11-18 19:35:23 +00008007 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009
Thomas Wouters477c8d52006-05-27 19:21:47 +00008010 result = stringlib_rfind_slice(
8011 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8012 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8013 start, end
8014 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015
8016 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008017
Christian Heimes217cfd12007-12-02 14:31:20 +00008018 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019}
8020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008021PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008024Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025
8026static PyObject *
8027unicode_rindex(PyUnicodeObject *self, PyObject *args)
8028{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008029 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008030 Py_ssize_t start;
8031 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008032 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033
Christian Heimes9cd17752007-11-18 19:35:23 +00008034 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036
Thomas Wouters477c8d52006-05-27 19:21:47 +00008037 result = stringlib_rfind_slice(
8038 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8039 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8040 start, end
8041 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042
8043 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008044
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 if (result < 0) {
8046 PyErr_SetString(PyExc_ValueError, "substring not found");
8047 return NULL;
8048 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008049 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050}
8051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008052PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008055Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008056done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057
8058static PyObject *
8059unicode_rjust(PyUnicodeObject *self, PyObject *args)
8060{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008061 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008062 Py_UNICODE fillchar = ' ';
8063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008064 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 return NULL;
8066
Tim Peters7a29bd52001-09-12 03:03:31 +00008067 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068 Py_INCREF(self);
8069 return (PyObject*) self;
8070 }
8071
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008072 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073}
8074
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 PyObject *sep,
8077 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078{
8079 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008080
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 s = PyUnicode_FromObject(s);
8082 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 if (sep != NULL) {
8085 sep = PyUnicode_FromObject(sep);
8086 if (sep == NULL) {
8087 Py_DECREF(s);
8088 return NULL;
8089 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 }
8091
8092 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8093
8094 Py_DECREF(s);
8095 Py_XDECREF(sep);
8096 return result;
8097}
8098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008099PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101\n\
8102Return a list of the words in S, using sep as the\n\
8103delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008104splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008105whitespace string is a separator and empty strings are\n\
8106removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107
8108static PyObject*
8109unicode_split(PyUnicodeObject *self, PyObject *args)
8110{
8111 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113
Martin v. Löwis18e16552006-02-15 17:27:45 +00008114 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 return NULL;
8116
8117 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123}
8124
Thomas Wouters477c8d52006-05-27 19:21:47 +00008125PyObject *
8126PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8127{
8128 PyObject* str_obj;
8129 PyObject* sep_obj;
8130 PyObject* out;
8131
8132 str_obj = PyUnicode_FromObject(str_in);
8133 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008135 sep_obj = PyUnicode_FromObject(sep_in);
8136 if (!sep_obj) {
8137 Py_DECREF(str_obj);
8138 return NULL;
8139 }
8140
8141 out = stringlib_partition(
8142 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8143 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8144 );
8145
8146 Py_DECREF(sep_obj);
8147 Py_DECREF(str_obj);
8148
8149 return out;
8150}
8151
8152
8153PyObject *
8154PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8155{
8156 PyObject* str_obj;
8157 PyObject* sep_obj;
8158 PyObject* out;
8159
8160 str_obj = PyUnicode_FromObject(str_in);
8161 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008163 sep_obj = PyUnicode_FromObject(sep_in);
8164 if (!sep_obj) {
8165 Py_DECREF(str_obj);
8166 return NULL;
8167 }
8168
8169 out = stringlib_rpartition(
8170 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8171 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8172 );
8173
8174 Py_DECREF(sep_obj);
8175 Py_DECREF(str_obj);
8176
8177 return out;
8178}
8179
8180PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008182\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008183Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008184the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008185found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008186
8187static PyObject*
8188unicode_partition(PyUnicodeObject *self, PyObject *separator)
8189{
8190 return PyUnicode_Partition((PyObject *)self, separator);
8191}
8192
8193PyDoc_STRVAR(rpartition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 "S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008195\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008196Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008197the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008198separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008199
8200static PyObject*
8201unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8202{
8203 return PyUnicode_RPartition((PyObject *)self, separator);
8204}
8205
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008206PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008207 PyObject *sep,
8208 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008209{
8210 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008211
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008212 s = PyUnicode_FromObject(s);
8213 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 if (sep != NULL) {
8216 sep = PyUnicode_FromObject(sep);
8217 if (sep == NULL) {
8218 Py_DECREF(s);
8219 return NULL;
8220 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008221 }
8222
8223 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8224
8225 Py_DECREF(s);
8226 Py_XDECREF(sep);
8227 return result;
8228}
8229
8230PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008232\n\
8233Return a list of the words in S, using sep as the\n\
8234delimiter string, starting at the end of the string and\n\
8235working to the front. If maxsplit is given, at most maxsplit\n\
8236splits are done. If sep is not specified, any whitespace string\n\
8237is a separator.");
8238
8239static PyObject*
8240unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8241{
8242 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008243 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008244
Martin v. Löwis18e16552006-02-15 17:27:45 +00008245 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008246 return NULL;
8247
8248 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008250 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008252 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008254}
8255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008256PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258\n\
8259Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008260Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008261is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262
8263static PyObject*
8264unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8265{
Guido van Rossum86662912000-04-11 15:38:46 +00008266 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267
Guido van Rossum86662912000-04-11 15:38:46 +00008268 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 return NULL;
8270
Guido van Rossum86662912000-04-11 15:38:46 +00008271 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272}
8273
8274static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008275PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276{
Walter Dörwald346737f2007-05-31 10:44:43 +00008277 if (PyUnicode_CheckExact(self)) {
8278 Py_INCREF(self);
8279 return self;
8280 } else
8281 /* Subtype -- return genuine unicode string with the same value. */
8282 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8283 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284}
8285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008286PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288\n\
8289Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008290and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291
8292static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008293unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 return fixup(self, fixswapcase);
8296}
8297
Georg Brandlceee0772007-11-27 23:48:05 +00008298PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008300\n\
8301Return a translation table usable for str.translate().\n\
8302If there is only one argument, it must be a dictionary mapping Unicode\n\
8303ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008304Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008305If there are two arguments, they must be strings of equal length, and\n\
8306in the resulting dictionary, each character in x will be mapped to the\n\
8307character at the same position in y. If there is a third argument, it\n\
8308must be a string, whose characters will be mapped to None in the result.");
8309
8310static PyObject*
8311unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8312{
8313 PyObject *x, *y = NULL, *z = NULL;
8314 PyObject *new = NULL, *key, *value;
8315 Py_ssize_t i = 0;
8316 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008317
Georg Brandlceee0772007-11-27 23:48:05 +00008318 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8319 return NULL;
8320 new = PyDict_New();
8321 if (!new)
8322 return NULL;
8323 if (y != NULL) {
8324 /* x must be a string too, of equal length */
8325 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8326 if (!PyUnicode_Check(x)) {
8327 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8328 "be a string if there is a second argument");
8329 goto err;
8330 }
8331 if (PyUnicode_GET_SIZE(x) != ylen) {
8332 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8333 "arguments must have equal length");
8334 goto err;
8335 }
8336 /* create entries for translating chars in x to those in y */
8337 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008338 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8339 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008340 if (!key || !value)
8341 goto err;
8342 res = PyDict_SetItem(new, key, value);
8343 Py_DECREF(key);
8344 Py_DECREF(value);
8345 if (res < 0)
8346 goto err;
8347 }
8348 /* create entries for deleting chars in z */
8349 if (z != NULL) {
8350 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008351 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008352 if (!key)
8353 goto err;
8354 res = PyDict_SetItem(new, key, Py_None);
8355 Py_DECREF(key);
8356 if (res < 0)
8357 goto err;
8358 }
8359 }
8360 } else {
8361 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008362 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008363 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8364 "to maketrans it must be a dict");
8365 goto err;
8366 }
8367 /* copy entries into the new dict, converting string keys to int keys */
8368 while (PyDict_Next(x, &i, &key, &value)) {
8369 if (PyUnicode_Check(key)) {
8370 /* convert string keys to integer keys */
8371 PyObject *newkey;
8372 if (PyUnicode_GET_SIZE(key) != 1) {
8373 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8374 "table must be of length 1");
8375 goto err;
8376 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008377 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008378 if (!newkey)
8379 goto err;
8380 res = PyDict_SetItem(new, newkey, value);
8381 Py_DECREF(newkey);
8382 if (res < 0)
8383 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008384 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008385 /* just keep integer keys */
8386 if (PyDict_SetItem(new, key, value) < 0)
8387 goto err;
8388 } else {
8389 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8390 "be strings or integers");
8391 goto err;
8392 }
8393 }
8394 }
8395 return new;
8396 err:
8397 Py_DECREF(new);
8398 return NULL;
8399}
8400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008401PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403\n\
8404Return a copy of the string S, where all characters have been mapped\n\
8405through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008406Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008407Unmapped characters are left untouched. Characters mapped to None\n\
8408are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409
8410static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008411unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412{
Georg Brandlceee0772007-11-27 23:48:05 +00008413 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414}
8415
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008416PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008419Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420
8421static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008422unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424 return fixup(self, fixupper);
8425}
8426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008427PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008430Pad a numeric string S with zeros on the left, to fill a field\n\
8431of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432
8433static PyObject *
8434unicode_zfill(PyUnicodeObject *self, PyObject *args)
8435{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 PyUnicodeObject *u;
8438
Martin v. Löwis18e16552006-02-15 17:27:45 +00008439 Py_ssize_t width;
8440 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441 return NULL;
8442
8443 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008444 if (PyUnicode_CheckExact(self)) {
8445 Py_INCREF(self);
8446 return (PyObject*) self;
8447 }
8448 else
8449 return PyUnicode_FromUnicode(
8450 PyUnicode_AS_UNICODE(self),
8451 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 }
8454
8455 fill = width - self->length;
8456
8457 u = pad(self, fill, 0, '0');
8458
Walter Dörwald068325e2002-04-15 13:36:47 +00008459 if (u == NULL)
8460 return NULL;
8461
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 if (u->str[fill] == '+' || u->str[fill] == '-') {
8463 /* move sign to beginning of string */
8464 u->str[0] = u->str[fill];
8465 u->str[fill] = '0';
8466 }
8467
8468 return (PyObject*) u;
8469}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470
#if 0
/* Compiled out.  Returns the file-level `numfree` counter as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008479PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008482Return True if S starts with the specified prefix, False otherwise.\n\
8483With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008484With optional end, stop comparing S at that position.\n\
8485prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486
8487static PyObject *
8488unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008491 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008493 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008494 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008495 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008497 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8499 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008500 if (PyTuple_Check(subobj)) {
8501 Py_ssize_t i;
8502 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8503 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008505 if (substring == NULL)
8506 return NULL;
8507 result = tailmatch(self, substring, start, end, -1);
8508 Py_DECREF(substring);
8509 if (result) {
8510 Py_RETURN_TRUE;
8511 }
8512 }
8513 /* nothing matched */
8514 Py_RETURN_FALSE;
8515 }
8516 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008519 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008521 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522}
8523
8524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008525PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008528Return True if S ends with the specified suffix, False otherwise.\n\
8529With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008530With optional end, stop comparing S at that position.\n\
8531suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532
8533static PyObject *
8534unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008537 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008539 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008540 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008541 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008543 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8545 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008546 if (PyTuple_Check(subobj)) {
8547 Py_ssize_t i;
8548 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8549 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008551 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008553 result = tailmatch(self, substring, start, end, +1);
8554 Py_DECREF(substring);
8555 if (result) {
8556 Py_RETURN_TRUE;
8557 }
8558 }
8559 Py_RETURN_FALSE;
8560 }
8561 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008565 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008567 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568}
8569
Eric Smith8c663262007-08-25 02:26:07 +00008570#include "stringlib/string_format.h"
8571
8572PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008574\n\
8575");
8576
Eric Smith4a7d76d2008-05-30 18:10:19 +00008577static PyObject *
8578unicode__format__(PyObject* self, PyObject* args)
8579{
8580 PyObject *format_spec;
8581
8582 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8583 return NULL;
8584
8585 return _PyUnicode_FormatAdvanced(self,
8586 PyUnicode_AS_UNICODE(format_spec),
8587 PyUnicode_GET_SIZE(format_spec));
8588}
8589
Eric Smith8c663262007-08-25 02:26:07 +00008590PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008592\n\
8593");
8594
8595static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008596unicode__sizeof__(PyUnicodeObject *v)
8597{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008598 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8599 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008600}
8601
8602PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008604
8605static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008606unicode_getnewargs(PyUnicodeObject *v)
8607{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008608 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008609}
8610
8611
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612static PyMethodDef unicode_methods[] = {
8613
8614 /* Order is according to common usage: often used methods should
8615 appear first, since lookup is done sequentially. */
8616
Benjamin Peterson308d6372009-09-18 21:42:35 +00008617 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008618 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8619 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008620 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008621 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8622 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8623 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8624 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8625 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8626 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8627 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008628 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008629 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8630 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8631 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008632 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008633 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8634 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8635 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008636 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008637 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008638 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008639 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008640 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8641 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8642 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8643 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8644 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8645 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8646 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8647 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8648 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8649 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8650 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8651 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8652 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8653 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008654 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008655 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008656 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008657 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008658 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008659 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8660 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008661 {"maketrans", (PyCFunction) unicode_maketrans,
8662 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008663 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008664#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008665 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666#endif
8667
8668#if 0
8669 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008670 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671#endif
8672
Benjamin Peterson14339b62009-01-31 16:36:08 +00008673 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 {NULL, NULL}
8675};
8676
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008677static PyObject *
8678unicode_mod(PyObject *v, PyObject *w)
8679{
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 if (!PyUnicode_Check(v)) {
8681 Py_INCREF(Py_NotImplemented);
8682 return Py_NotImplemented;
8683 }
8684 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008685}
8686
8687static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008688 0, /*nb_add*/
8689 0, /*nb_subtract*/
8690 0, /*nb_multiply*/
8691 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008692};
8693
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008695 (lenfunc) unicode_length, /* sq_length */
8696 PyUnicode_Concat, /* sq_concat */
8697 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8698 (ssizeargfunc) unicode_getitem, /* sq_item */
8699 0, /* sq_slice */
8700 0, /* sq_ass_item */
8701 0, /* sq_ass_slice */
8702 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703};
8704
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008705static PyObject*
8706unicode_subscript(PyUnicodeObject* self, PyObject* item)
8707{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008708 if (PyIndex_Check(item)) {
8709 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008710 if (i == -1 && PyErr_Occurred())
8711 return NULL;
8712 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008713 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008714 return unicode_getitem(self, i);
8715 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008716 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008717 Py_UNICODE* source_buf;
8718 Py_UNICODE* result_buf;
8719 PyObject* result;
8720
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008721 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008723 return NULL;
8724 }
8725
8726 if (slicelength <= 0) {
8727 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008728 } else if (start == 0 && step == 1 && slicelength == self->length &&
8729 PyUnicode_CheckExact(self)) {
8730 Py_INCREF(self);
8731 return (PyObject *)self;
8732 } else if (step == 1) {
8733 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008734 } else {
8735 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008736 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8737 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008738
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 if (result_buf == NULL)
8740 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008741
8742 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8743 result_buf[i] = source_buf[cur];
8744 }
Tim Petersced69f82003-09-16 20:30:58 +00008745
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008746 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008747 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008748 return result;
8749 }
8750 } else {
8751 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8752 return NULL;
8753 }
8754}
8755
8756static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 (lenfunc)unicode_length, /* mp_length */
8758 (binaryfunc)unicode_subscript, /* mp_subscript */
8759 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008760};
8761
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763/* Helpers for PyUnicode_Format() */
8764
8765static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008766getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008768 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 (*p_argidx)++;
8771 if (arglen < 0)
8772 return args;
8773 else
8774 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 }
8776 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 return NULL;
8779}
8780
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008781/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008783static PyObject *
8784formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008786 char *p;
8787 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008789
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790 x = PyFloat_AsDouble(v);
8791 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008792 return NULL;
8793
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008796
Eric Smith0923d1d2009-04-16 20:16:10 +00008797 p = PyOS_double_to_string(x, type, prec,
8798 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008799 if (p == NULL)
8800 return NULL;
8801 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008802 PyMem_Free(p);
8803 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804}
8805
Tim Peters38fd5b62000-09-21 05:43:11 +00008806static PyObject*
8807formatlong(PyObject *val, int flags, int prec, int type)
8808{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008809 char *buf;
8810 int len;
8811 PyObject *str; /* temporary string object. */
8812 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008813
Benjamin Peterson14339b62009-01-31 16:36:08 +00008814 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8815 if (!str)
8816 return NULL;
8817 result = PyUnicode_FromStringAndSize(buf, len);
8818 Py_DECREF(str);
8819 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008820}
8821
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822static int
8823formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008824 size_t buflen,
8825 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008827 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008828 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 if (PyUnicode_GET_SIZE(v) == 1) {
8830 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8831 buf[1] = '\0';
8832 return 1;
8833 }
8834#ifndef Py_UNICODE_WIDE
8835 if (PyUnicode_GET_SIZE(v) == 2) {
8836 /* Decode a valid surrogate pair */
8837 int c0 = PyUnicode_AS_UNICODE(v)[0];
8838 int c1 = PyUnicode_AS_UNICODE(v)[1];
8839 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8840 0xDC00 <= c1 && c1 <= 0xDFFF) {
8841 buf[0] = c0;
8842 buf[1] = c1;
8843 buf[2] = '\0';
8844 return 2;
8845 }
8846 }
8847#endif
8848 goto onError;
8849 }
8850 else {
8851 /* Integer input truncated to a character */
8852 long x;
8853 x = PyLong_AsLong(v);
8854 if (x == -1 && PyErr_Occurred())
8855 goto onError;
8856
8857 if (x < 0 || x > 0x10ffff) {
8858 PyErr_SetString(PyExc_OverflowError,
8859 "%c arg not in range(0x110000)");
8860 return -1;
8861 }
8862
8863#ifndef Py_UNICODE_WIDE
8864 if (x > 0xffff) {
8865 x -= 0x10000;
8866 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8867 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8868 return 2;
8869 }
8870#endif
8871 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008872 buf[1] = '\0';
8873 return 1;
8874 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008875
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008877 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008879 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880}
8881
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008882/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008883 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008884*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008885#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008886
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889{
8890 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008891 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 int args_owned = 0;
8893 PyUnicodeObject *result = NULL;
8894 PyObject *dict = NULL;
8895 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008896
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 PyErr_BadInternalCall();
8899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900 }
8901 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008902 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904 fmt = PyUnicode_AS_UNICODE(uformat);
8905 fmtcnt = PyUnicode_GET_SIZE(uformat);
8906
8907 reslen = rescnt = fmtcnt + 100;
8908 result = _PyUnicode_New(reslen);
8909 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911 res = PyUnicode_AS_UNICODE(result);
8912
8913 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 arglen = PyTuple_Size(args);
8915 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916 }
8917 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 arglen = -1;
8919 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920 }
Christian Heimes90aa7642007-12-19 02:45:37 +00008921 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00008922 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924
8925 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 if (*fmt != '%') {
8927 if (--rescnt < 0) {
8928 rescnt = fmtcnt + 100;
8929 reslen += rescnt;
8930 if (_PyUnicode_Resize(&result, reslen) < 0)
8931 goto onError;
8932 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8933 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008934 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008936 }
8937 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 /* Got a format specifier */
8939 int flags = 0;
8940 Py_ssize_t width = -1;
8941 int prec = -1;
8942 Py_UNICODE c = '\0';
8943 Py_UNICODE fill;
8944 int isnumok;
8945 PyObject *v = NULL;
8946 PyObject *temp = NULL;
8947 Py_UNICODE *pbuf;
8948 Py_UNICODE sign;
8949 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008950 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951
Benjamin Peterson29060642009-01-31 22:14:21 +00008952 fmt++;
8953 if (*fmt == '(') {
8954 Py_UNICODE *keystart;
8955 Py_ssize_t keylen;
8956 PyObject *key;
8957 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00008958
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 if (dict == NULL) {
8960 PyErr_SetString(PyExc_TypeError,
8961 "format requires a mapping");
8962 goto onError;
8963 }
8964 ++fmt;
8965 --fmtcnt;
8966 keystart = fmt;
8967 /* Skip over balanced parentheses */
8968 while (pcount > 0 && --fmtcnt >= 0) {
8969 if (*fmt == ')')
8970 --pcount;
8971 else if (*fmt == '(')
8972 ++pcount;
8973 fmt++;
8974 }
8975 keylen = fmt - keystart - 1;
8976 if (fmtcnt < 0 || pcount > 0) {
8977 PyErr_SetString(PyExc_ValueError,
8978 "incomplete format key");
8979 goto onError;
8980 }
8981#if 0
8982 /* keys are converted to strings using UTF-8 and
8983 then looked up since Python uses strings to hold
8984 variables names etc. in its namespaces and we
8985 wouldn't want to break common idioms. */
8986 key = PyUnicode_EncodeUTF8(keystart,
8987 keylen,
8988 NULL);
8989#else
8990 key = PyUnicode_FromUnicode(keystart, keylen);
8991#endif
8992 if (key == NULL)
8993 goto onError;
8994 if (args_owned) {
8995 Py_DECREF(args);
8996 args_owned = 0;
8997 }
8998 args = PyObject_GetItem(dict, key);
8999 Py_DECREF(key);
9000 if (args == NULL) {
9001 goto onError;
9002 }
9003 args_owned = 1;
9004 arglen = -1;
9005 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009006 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 while (--fmtcnt >= 0) {
9008 switch (c = *fmt++) {
9009 case '-': flags |= F_LJUST; continue;
9010 case '+': flags |= F_SIGN; continue;
9011 case ' ': flags |= F_BLANK; continue;
9012 case '#': flags |= F_ALT; continue;
9013 case '0': flags |= F_ZERO; continue;
9014 }
9015 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009016 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 if (c == '*') {
9018 v = getnextarg(args, arglen, &argidx);
9019 if (v == NULL)
9020 goto onError;
9021 if (!PyLong_Check(v)) {
9022 PyErr_SetString(PyExc_TypeError,
9023 "* wants int");
9024 goto onError;
9025 }
9026 width = PyLong_AsLong(v);
9027 if (width == -1 && PyErr_Occurred())
9028 goto onError;
9029 if (width < 0) {
9030 flags |= F_LJUST;
9031 width = -width;
9032 }
9033 if (--fmtcnt >= 0)
9034 c = *fmt++;
9035 }
9036 else if (c >= '0' && c <= '9') {
9037 width = c - '0';
9038 while (--fmtcnt >= 0) {
9039 c = *fmt++;
9040 if (c < '0' || c > '9')
9041 break;
9042 if ((width*10) / 10 != width) {
9043 PyErr_SetString(PyExc_ValueError,
9044 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009045 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 }
9047 width = width*10 + (c - '0');
9048 }
9049 }
9050 if (c == '.') {
9051 prec = 0;
9052 if (--fmtcnt >= 0)
9053 c = *fmt++;
9054 if (c == '*') {
9055 v = getnextarg(args, arglen, &argidx);
9056 if (v == NULL)
9057 goto onError;
9058 if (!PyLong_Check(v)) {
9059 PyErr_SetString(PyExc_TypeError,
9060 "* wants int");
9061 goto onError;
9062 }
9063 prec = PyLong_AsLong(v);
9064 if (prec == -1 && PyErr_Occurred())
9065 goto onError;
9066 if (prec < 0)
9067 prec = 0;
9068 if (--fmtcnt >= 0)
9069 c = *fmt++;
9070 }
9071 else if (c >= '0' && c <= '9') {
9072 prec = c - '0';
9073 while (--fmtcnt >= 0) {
9074 c = Py_CHARMASK(*fmt++);
9075 if (c < '0' || c > '9')
9076 break;
9077 if ((prec*10) / 10 != prec) {
9078 PyErr_SetString(PyExc_ValueError,
9079 "prec too big");
9080 goto onError;
9081 }
9082 prec = prec*10 + (c - '0');
9083 }
9084 }
9085 } /* prec */
9086 if (fmtcnt >= 0) {
9087 if (c == 'h' || c == 'l' || c == 'L') {
9088 if (--fmtcnt >= 0)
9089 c = *fmt++;
9090 }
9091 }
9092 if (fmtcnt < 0) {
9093 PyErr_SetString(PyExc_ValueError,
9094 "incomplete format");
9095 goto onError;
9096 }
9097 if (c != '%') {
9098 v = getnextarg(args, arglen, &argidx);
9099 if (v == NULL)
9100 goto onError;
9101 }
9102 sign = 0;
9103 fill = ' ';
9104 switch (c) {
9105
9106 case '%':
9107 pbuf = formatbuf;
9108 /* presume that buffer length is at least 1 */
9109 pbuf[0] = '%';
9110 len = 1;
9111 break;
9112
9113 case 's':
9114 case 'r':
9115 case 'a':
9116 if (PyUnicode_Check(v) && c == 's') {
9117 temp = v;
9118 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009119 }
9120 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 if (c == 's')
9122 temp = PyObject_Str(v);
9123 else if (c == 'r')
9124 temp = PyObject_Repr(v);
9125 else
9126 temp = PyObject_ASCII(v);
9127 if (temp == NULL)
9128 goto onError;
9129 if (PyUnicode_Check(temp))
9130 /* nothing to do */;
9131 else {
9132 Py_DECREF(temp);
9133 PyErr_SetString(PyExc_TypeError,
9134 "%s argument has non-string str()");
9135 goto onError;
9136 }
9137 }
9138 pbuf = PyUnicode_AS_UNICODE(temp);
9139 len = PyUnicode_GET_SIZE(temp);
9140 if (prec >= 0 && len > prec)
9141 len = prec;
9142 break;
9143
9144 case 'i':
9145 case 'd':
9146 case 'u':
9147 case 'o':
9148 case 'x':
9149 case 'X':
9150 if (c == 'i')
9151 c = 'd';
9152 isnumok = 0;
9153 if (PyNumber_Check(v)) {
9154 PyObject *iobj=NULL;
9155
9156 if (PyLong_Check(v)) {
9157 iobj = v;
9158 Py_INCREF(iobj);
9159 }
9160 else {
9161 iobj = PyNumber_Long(v);
9162 }
9163 if (iobj!=NULL) {
9164 if (PyLong_Check(iobj)) {
9165 isnumok = 1;
9166 temp = formatlong(iobj, flags, prec, c);
9167 Py_DECREF(iobj);
9168 if (!temp)
9169 goto onError;
9170 pbuf = PyUnicode_AS_UNICODE(temp);
9171 len = PyUnicode_GET_SIZE(temp);
9172 sign = 1;
9173 }
9174 else {
9175 Py_DECREF(iobj);
9176 }
9177 }
9178 }
9179 if (!isnumok) {
9180 PyErr_Format(PyExc_TypeError,
9181 "%%%c format: a number is required, "
9182 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9183 goto onError;
9184 }
9185 if (flags & F_ZERO)
9186 fill = '0';
9187 break;
9188
9189 case 'e':
9190 case 'E':
9191 case 'f':
9192 case 'F':
9193 case 'g':
9194 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009195 temp = formatfloat(v, flags, prec, c);
9196 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009197 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009198 pbuf = PyUnicode_AS_UNICODE(temp);
9199 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 sign = 1;
9201 if (flags & F_ZERO)
9202 fill = '0';
9203 break;
9204
9205 case 'c':
9206 pbuf = formatbuf;
9207 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9208 if (len < 0)
9209 goto onError;
9210 break;
9211
9212 default:
9213 PyErr_Format(PyExc_ValueError,
9214 "unsupported format character '%c' (0x%x) "
9215 "at index %zd",
9216 (31<=c && c<=126) ? (char)c : '?',
9217 (int)c,
9218 (Py_ssize_t)(fmt - 1 -
9219 PyUnicode_AS_UNICODE(uformat)));
9220 goto onError;
9221 }
9222 if (sign) {
9223 if (*pbuf == '-' || *pbuf == '+') {
9224 sign = *pbuf++;
9225 len--;
9226 }
9227 else if (flags & F_SIGN)
9228 sign = '+';
9229 else if (flags & F_BLANK)
9230 sign = ' ';
9231 else
9232 sign = 0;
9233 }
9234 if (width < len)
9235 width = len;
9236 if (rescnt - (sign != 0) < width) {
9237 reslen -= rescnt;
9238 rescnt = width + fmtcnt + 100;
9239 reslen += rescnt;
9240 if (reslen < 0) {
9241 Py_XDECREF(temp);
9242 PyErr_NoMemory();
9243 goto onError;
9244 }
9245 if (_PyUnicode_Resize(&result, reslen) < 0) {
9246 Py_XDECREF(temp);
9247 goto onError;
9248 }
9249 res = PyUnicode_AS_UNICODE(result)
9250 + reslen - rescnt;
9251 }
9252 if (sign) {
9253 if (fill != ' ')
9254 *res++ = sign;
9255 rescnt--;
9256 if (width > len)
9257 width--;
9258 }
9259 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9260 assert(pbuf[0] == '0');
9261 assert(pbuf[1] == c);
9262 if (fill != ' ') {
9263 *res++ = *pbuf++;
9264 *res++ = *pbuf++;
9265 }
9266 rescnt -= 2;
9267 width -= 2;
9268 if (width < 0)
9269 width = 0;
9270 len -= 2;
9271 }
9272 if (width > len && !(flags & F_LJUST)) {
9273 do {
9274 --rescnt;
9275 *res++ = fill;
9276 } while (--width > len);
9277 }
9278 if (fill == ' ') {
9279 if (sign)
9280 *res++ = sign;
9281 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9282 assert(pbuf[0] == '0');
9283 assert(pbuf[1] == c);
9284 *res++ = *pbuf++;
9285 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009286 }
9287 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 Py_UNICODE_COPY(res, pbuf, len);
9289 res += len;
9290 rescnt -= len;
9291 while (--width >= len) {
9292 --rescnt;
9293 *res++ = ' ';
9294 }
9295 if (dict && (argidx < arglen) && c != '%') {
9296 PyErr_SetString(PyExc_TypeError,
9297 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009298 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009299 goto onError;
9300 }
9301 Py_XDECREF(temp);
9302 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303 } /* until end */
9304 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 PyErr_SetString(PyExc_TypeError,
9306 "not all arguments converted during string formatting");
9307 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308 }
9309
Thomas Woutersa96affe2006-03-12 00:29:36 +00009310 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009311 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009313 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314 }
9315 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316 return (PyObject *)result;
9317
Benjamin Peterson29060642009-01-31 22:14:21 +00009318 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319 Py_XDECREF(result);
9320 Py_DECREF(uformat);
9321 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323 }
9324 return NULL;
9325}
9326
Jeremy Hylton938ace62002-07-17 16:30:39 +00009327static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009328unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9329
Tim Peters6d6c1a32001-08-02 04:15:00 +00009330static PyObject *
9331unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9332{
Benjamin Peterson29060642009-01-31 22:14:21 +00009333 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009334 static char *kwlist[] = {"object", "encoding", "errors", 0};
9335 char *encoding = NULL;
9336 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009337
Benjamin Peterson14339b62009-01-31 16:36:08 +00009338 if (type != &PyUnicode_Type)
9339 return unicode_subtype_new(type, args, kwds);
9340 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009341 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009342 return NULL;
9343 if (x == NULL)
9344 return (PyObject *)_PyUnicode_New(0);
9345 if (encoding == NULL && errors == NULL)
9346 return PyObject_Str(x);
9347 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009349}
9350
Guido van Rossume023fe02001-08-30 03:12:59 +00009351static PyObject *
9352unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9353{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009354 PyUnicodeObject *tmp, *pnew;
9355 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009356
Benjamin Peterson14339b62009-01-31 16:36:08 +00009357 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9358 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9359 if (tmp == NULL)
9360 return NULL;
9361 assert(PyUnicode_Check(tmp));
9362 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9363 if (pnew == NULL) {
9364 Py_DECREF(tmp);
9365 return NULL;
9366 }
9367 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9368 if (pnew->str == NULL) {
9369 _Py_ForgetReference((PyObject *)pnew);
9370 PyObject_Del(pnew);
9371 Py_DECREF(tmp);
9372 return PyErr_NoMemory();
9373 }
9374 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9375 pnew->length = n;
9376 pnew->hash = tmp->hash;
9377 Py_DECREF(tmp);
9378 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009379}
9380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009381PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009383\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009384Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009385encoding defaults to the current default string encoding.\n\
9386errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009387
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009388static PyObject *unicode_iter(PyObject *seq);
9389
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009391 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009392 "str", /* tp_name */
9393 sizeof(PyUnicodeObject), /* tp_size */
9394 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009396 (destructor)unicode_dealloc, /* tp_dealloc */
9397 0, /* tp_print */
9398 0, /* tp_getattr */
9399 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009400 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009401 unicode_repr, /* tp_repr */
9402 &unicode_as_number, /* tp_as_number */
9403 &unicode_as_sequence, /* tp_as_sequence */
9404 &unicode_as_mapping, /* tp_as_mapping */
9405 (hashfunc) unicode_hash, /* tp_hash*/
9406 0, /* tp_call*/
9407 (reprfunc) unicode_str, /* tp_str */
9408 PyObject_GenericGetAttr, /* tp_getattro */
9409 0, /* tp_setattro */
9410 0, /* tp_as_buffer */
9411 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009412 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009413 unicode_doc, /* tp_doc */
9414 0, /* tp_traverse */
9415 0, /* tp_clear */
9416 PyUnicode_RichCompare, /* tp_richcompare */
9417 0, /* tp_weaklistoffset */
9418 unicode_iter, /* tp_iter */
9419 0, /* tp_iternext */
9420 unicode_methods, /* tp_methods */
9421 0, /* tp_members */
9422 0, /* tp_getset */
9423 &PyBaseObject_Type, /* tp_base */
9424 0, /* tp_dict */
9425 0, /* tp_descr_get */
9426 0, /* tp_descr_set */
9427 0, /* tp_dictoffset */
9428 0, /* tp_init */
9429 0, /* tp_alloc */
9430 unicode_new, /* tp_new */
9431 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432};
9433
9434/* Initialize the Unicode implementation */
9435
Thomas Wouters78890102000-07-22 19:25:51 +00009436void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009438 int i;
9439
Thomas Wouters477c8d52006-05-27 19:21:47 +00009440 /* XXX - move this array to unicodectype.c ? */
9441 Py_UNICODE linebreak[] = {
9442 0x000A, /* LINE FEED */
9443 0x000D, /* CARRIAGE RETURN */
9444 0x001C, /* FILE SEPARATOR */
9445 0x001D, /* GROUP SEPARATOR */
9446 0x001E, /* RECORD SEPARATOR */
9447 0x0085, /* NEXT LINE */
9448 0x2028, /* LINE SEPARATOR */
9449 0x2029, /* PARAGRAPH SEPARATOR */
9450 };
9451
Fred Drakee4315f52000-05-09 19:53:39 +00009452 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009453 free_list = NULL;
9454 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009456 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009459 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009460 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009461 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009462 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009463
9464 /* initialize the linebreak bloom filter */
9465 bloom_linebreak = make_bloom_mask(
9466 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9467 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009468
9469 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470}
9471
9472/* Finalize the Unicode implementation */
9473
Christian Heimesa156e092008-02-16 07:38:31 +00009474int
9475PyUnicode_ClearFreeList(void)
9476{
9477 int freelist_size = numfree;
9478 PyUnicodeObject *u;
9479
9480 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009481 PyUnicodeObject *v = u;
9482 u = *(PyUnicodeObject **)u;
9483 if (v->str)
9484 PyObject_DEL(v->str);
9485 Py_XDECREF(v->defenc);
9486 PyObject_Del(v);
9487 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009488 }
9489 free_list = NULL;
9490 assert(numfree == 0);
9491 return freelist_size;
9492}
9493
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494void
Thomas Wouters78890102000-07-22 19:25:51 +00009495_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009497 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009499 Py_XDECREF(unicode_empty);
9500 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009501
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009502 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 if (unicode_latin1[i]) {
9504 Py_DECREF(unicode_latin1[i]);
9505 unicode_latin1[i] = NULL;
9506 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009507 }
Christian Heimesa156e092008-02-16 07:38:31 +00009508 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009510
Walter Dörwald16807132007-05-25 13:52:07 +00009511void
9512PyUnicode_InternInPlace(PyObject **p)
9513{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009514 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9515 PyObject *t;
9516 if (s == NULL || !PyUnicode_Check(s))
9517 Py_FatalError(
9518 "PyUnicode_InternInPlace: unicode strings only please!");
9519 /* If it's a subclass, we don't really know what putting
9520 it in the interned dict might do. */
9521 if (!PyUnicode_CheckExact(s))
9522 return;
9523 if (PyUnicode_CHECK_INTERNED(s))
9524 return;
9525 if (interned == NULL) {
9526 interned = PyDict_New();
9527 if (interned == NULL) {
9528 PyErr_Clear(); /* Don't leave an exception */
9529 return;
9530 }
9531 }
9532 /* It might be that the GetItem call fails even
9533 though the key is present in the dictionary,
9534 namely when this happens during a stack overflow. */
9535 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009536 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009537 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009538
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 if (t) {
9540 Py_INCREF(t);
9541 Py_DECREF(*p);
9542 *p = t;
9543 return;
9544 }
Walter Dörwald16807132007-05-25 13:52:07 +00009545
Benjamin Peterson14339b62009-01-31 16:36:08 +00009546 PyThreadState_GET()->recursion_critical = 1;
9547 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9548 PyErr_Clear();
9549 PyThreadState_GET()->recursion_critical = 0;
9550 return;
9551 }
9552 PyThreadState_GET()->recursion_critical = 0;
9553 /* The two references in interned are not counted by refcnt.
9554 The deallocator will take care of this */
9555 Py_REFCNT(s) -= 2;
9556 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009557}
9558
9559void
9560PyUnicode_InternImmortal(PyObject **p)
9561{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009562 PyUnicode_InternInPlace(p);
9563 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9564 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9565 Py_INCREF(*p);
9566 }
Walter Dörwald16807132007-05-25 13:52:07 +00009567}
9568
9569PyObject *
9570PyUnicode_InternFromString(const char *cp)
9571{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009572 PyObject *s = PyUnicode_FromString(cp);
9573 if (s == NULL)
9574 return NULL;
9575 PyUnicode_InternInPlace(&s);
9576 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009577}
9578
9579void _Py_ReleaseInternedUnicodeStrings(void)
9580{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009581 PyObject *keys;
9582 PyUnicodeObject *s;
9583 Py_ssize_t i, n;
9584 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009585
Benjamin Peterson14339b62009-01-31 16:36:08 +00009586 if (interned == NULL || !PyDict_Check(interned))
9587 return;
9588 keys = PyDict_Keys(interned);
9589 if (keys == NULL || !PyList_Check(keys)) {
9590 PyErr_Clear();
9591 return;
9592 }
Walter Dörwald16807132007-05-25 13:52:07 +00009593
Benjamin Peterson14339b62009-01-31 16:36:08 +00009594 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9595 detector, interned unicode strings are not forcibly deallocated;
9596 rather, we give them their stolen references back, and then clear
9597 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009598
Benjamin Peterson14339b62009-01-31 16:36:08 +00009599 n = PyList_GET_SIZE(keys);
9600 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009601 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009602 for (i = 0; i < n; i++) {
9603 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9604 switch (s->state) {
9605 case SSTATE_NOT_INTERNED:
9606 /* XXX Shouldn't happen */
9607 break;
9608 case SSTATE_INTERNED_IMMORTAL:
9609 Py_REFCNT(s) += 1;
9610 immortal_size += s->length;
9611 break;
9612 case SSTATE_INTERNED_MORTAL:
9613 Py_REFCNT(s) += 2;
9614 mortal_size += s->length;
9615 break;
9616 default:
9617 Py_FatalError("Inconsistent interned string state.");
9618 }
9619 s->state = SSTATE_NOT_INTERNED;
9620 }
9621 fprintf(stderr, "total size of all interned strings: "
9622 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9623 "mortal/immortal\n", mortal_size, immortal_size);
9624 Py_DECREF(keys);
9625 PyDict_Clear(interned);
9626 Py_DECREF(interned);
9627 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009628}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009629
9630
9631/********************* Unicode Iterator **************************/
9632
9633typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009634 PyObject_HEAD
9635 Py_ssize_t it_index;
9636 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009637} unicodeiterobject;
9638
9639static void
9640unicodeiter_dealloc(unicodeiterobject *it)
9641{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009642 _PyObject_GC_UNTRACK(it);
9643 Py_XDECREF(it->it_seq);
9644 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009645}
9646
9647static int
9648unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9649{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009650 Py_VISIT(it->it_seq);
9651 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009652}
9653
9654static PyObject *
9655unicodeiter_next(unicodeiterobject *it)
9656{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009657 PyUnicodeObject *seq;
9658 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009659
Benjamin Peterson14339b62009-01-31 16:36:08 +00009660 assert(it != NULL);
9661 seq = it->it_seq;
9662 if (seq == NULL)
9663 return NULL;
9664 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009665
Benjamin Peterson14339b62009-01-31 16:36:08 +00009666 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9667 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009668 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009669 if (item != NULL)
9670 ++it->it_index;
9671 return item;
9672 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009673
Benjamin Peterson14339b62009-01-31 16:36:08 +00009674 Py_DECREF(seq);
9675 it->it_seq = NULL;
9676 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009677}
9678
9679static PyObject *
9680unicodeiter_len(unicodeiterobject *it)
9681{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009682 Py_ssize_t len = 0;
9683 if (it->it_seq)
9684 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9685 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009686}
9687
9688PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9689
9690static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009691 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009692 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009693 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009694};
9695
9696PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009697 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9698 "str_iterator", /* tp_name */
9699 sizeof(unicodeiterobject), /* tp_basicsize */
9700 0, /* tp_itemsize */
9701 /* methods */
9702 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9703 0, /* tp_print */
9704 0, /* tp_getattr */
9705 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009706 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009707 0, /* tp_repr */
9708 0, /* tp_as_number */
9709 0, /* tp_as_sequence */
9710 0, /* tp_as_mapping */
9711 0, /* tp_hash */
9712 0, /* tp_call */
9713 0, /* tp_str */
9714 PyObject_GenericGetAttr, /* tp_getattro */
9715 0, /* tp_setattro */
9716 0, /* tp_as_buffer */
9717 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9718 0, /* tp_doc */
9719 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9720 0, /* tp_clear */
9721 0, /* tp_richcompare */
9722 0, /* tp_weaklistoffset */
9723 PyObject_SelfIter, /* tp_iter */
9724 (iternextfunc)unicodeiter_next, /* tp_iternext */
9725 unicodeiter_methods, /* tp_methods */
9726 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009727};
9728
9729static PyObject *
9730unicode_iter(PyObject *seq)
9731{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009732 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009733
Benjamin Peterson14339b62009-01-31 16:36:08 +00009734 if (!PyUnicode_Check(seq)) {
9735 PyErr_BadInternalCall();
9736 return NULL;
9737 }
9738 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9739 if (it == NULL)
9740 return NULL;
9741 it->it_index = 0;
9742 Py_INCREF(seq);
9743 it->it_seq = (PyUnicodeObject *)seq;
9744 _PyObject_GC_TRACK(it);
9745 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009746}
9747
Martin v. Löwis5b222132007-06-10 09:51:05 +00009748size_t
9749Py_UNICODE_strlen(const Py_UNICODE *u)
9750{
9751 int res = 0;
9752 while(*u++)
9753 res++;
9754 return res;
9755}
9756
9757Py_UNICODE*
9758Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9759{
9760 Py_UNICODE *u = s1;
9761 while ((*u++ = *s2++));
9762 return s1;
9763}
9764
9765Py_UNICODE*
9766Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9767{
9768 Py_UNICODE *u = s1;
9769 while ((*u++ = *s2++))
9770 if (n-- == 0)
9771 break;
9772 return s1;
9773}
9774
9775int
9776Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9777{
9778 while (*s1 && *s2 && *s1 == *s2)
9779 s1++, s2++;
9780 if (*s1 && *s2)
9781 return (*s1 < *s2) ? -1 : +1;
9782 if (*s1)
9783 return 1;
9784 if (*s2)
9785 return -1;
9786 return 0;
9787}
9788
9789Py_UNICODE*
9790Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9791{
9792 const Py_UNICODE *p;
9793 for (p = s; *p; p++)
9794 if (*p == c)
9795 return (Py_UNICODE*)p;
9796 return NULL;
9797}
9798
9799
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009800#ifdef __cplusplus
9801}
9802#endif