blob: bc1612dcf6551655ec154d5abc144f080840262a [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Fred Drakee4315f52000-05-09 19:53:39 +0000117/* Default encoding to use and assume when NULL is passed as encoding
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000118 parameter; it is fixed to "utf-8". Always use the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000119 PyUnicode_GetDefaultEncoding() API to access this global.
120
Alexandre Vassalotti3d2fd7f2007-10-16 00:26:33 +0000121 Don't forget to alter Py_FileSystemDefaultEncoding if you change the
Guido van Rossum00bc0e02007-10-15 02:52:41 +0000122 hard coded default!
123*/
Guido van Rossumf15a29f2007-05-04 00:41:39 +0000124static const char unicode_default_encoding[] = "utf-8";
Fred Drakee4315f52000-05-09 19:53:39 +0000125
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*  0x0009: HORIZONTAL TABULATION */
/*  0x000A: LINE FEED */
/*  0x000B: VERTICAL TABULATION */
/*  0x000C: FORM FEED */
/*  0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*  0x001C: FILE SEPARATOR */
/*  0x001D: GROUP SEPARATOR */
/*  0x001E: RECORD SEPARATOR */
/*  0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*  0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Same for linebreaks.  Indexed by code point 0..127; an entry is 1 for a
   line break character, 0 otherwise.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*  0x000A: LINE FEED */
/*  0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*  0x001C: FILE SEPARATOR */
/*  0x001D: GROUP SEPARATOR */
/*  0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
182
183
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000185PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000186{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000187#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188 return 0x10FFFF;
189#else
190 /* This is actually an illegal character, so it should
191 not be passed to unichr. */
192 return 0xFFFF;
193#endif
194}
195
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Test the bit of `mask` selected by the low 5 bits of `ch`.
   The shifted constant is an unsigned long so that bit 31 can be addressed
   without shifting into the sign bit of an int (undefined behaviour), and
   `mask` is parenthesized so that any expression may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: table lookup for ch < 128, otherwise the bloom filter
   is consulted before the full Py_UNICODE_ISLINEBREAK() check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
218 long mask;
219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
223 mask |= (1 << (ptr[i] & 0x1F));
224
225 return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
236 return 0;
237}
238
/* True when `chr` passes the bloom pre-check against `mask` and
   unicode_member() confirms it is in `set`.  The whole expansion is
   parenthesized so the macro can be negated or combined with other
   operators without precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 if (unicode == unicode_empty ||
259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000263 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Thomas Wouters477c8d52006-05-27 19:21:47 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
294/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000295 Ux0000 terminated; some code (e.g. new_identifier)
296 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1.
300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Thomas Wouters477c8d52006-05-27 19:21:47 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Christian Heimesb186d002008-03-18 15:15:01 +0000329 PyObject_DEL(unicode->str);
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000330 unicode->str = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000331 }
332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Christian Heimesb186d002008-03-18 15:15:01 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
349 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000363 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000364 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000366
367 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000368 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000371 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373}
374
375static
Guido van Rossum9475a232001-10-05 20:51:39 +0000376void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377{
Walter Dörwald16807132007-05-25 13:52:07 +0000378 switch (PyUnicode_CHECK_INTERNED(unicode)) {
379 case SSTATE_NOT_INTERNED:
380 break;
381
382 case SSTATE_INTERNED_MORTAL:
383 /* revive dead object temporarily for DelItem */
Christian Heimes90aa7642007-12-19 02:45:37 +0000384 Py_REFCNT(unicode) = 3;
Walter Dörwald16807132007-05-25 13:52:07 +0000385 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386 Py_FatalError(
Benjamin Peterson142957c2008-07-04 19:55:29 +0000387 "deletion of interned string failed");
Walter Dörwald16807132007-05-25 13:52:07 +0000388 break;
389
390 case SSTATE_INTERNED_IMMORTAL:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000391 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000392
393 default:
Benjamin Peterson142957c2008-07-04 19:55:29 +0000394 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000395 }
396
Guido van Rossum604ddf82001-12-06 20:03:56 +0000397 if (PyUnicode_CheckExact(unicode) &&
Christian Heimes2202f872008-02-06 14:31:34 +0000398 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000399 /* Keep-Alive optimization */
400 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Christian Heimesb186d002008-03-18 15:15:01 +0000401 PyObject_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 unicode->str = NULL;
403 unicode->length = 0;
404 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000405 if (unicode->defenc) {
406 Py_DECREF(unicode->defenc);
407 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000408 }
409 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000410 *(PyUnicodeObject **)unicode = free_list;
411 free_list = unicode;
412 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
414 else {
Christian Heimesb186d002008-03-18 15:15:01 +0000415 PyObject_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000416 Py_XDECREF(unicode->defenc);
Christian Heimes90aa7642007-12-19 02:45:37 +0000417 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 }
419}
420
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000421static
422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000423{
424 register PyUnicodeObject *v;
425
426 /* Argument checks */
427 if (unicode == NULL) {
428 PyErr_BadInternalCall();
429 return -1;
430 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000432 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433 PyErr_BadInternalCall();
434 return -1;
435 }
436
437 /* Resizing unicode_empty and single character objects is not
438 possible since these are being shared. We simply return a fresh
439 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000440 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000441 (v == unicode_empty || v->length == 1)) {
442 PyUnicodeObject *w = _PyUnicode_New(length);
443 if (w == NULL)
444 return -1;
445 Py_UNICODE_COPY(w->str, v->str,
446 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000447 Py_DECREF(*unicode);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000448 *unicode = w;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 return 0;
450 }
451
452 /* Note that we don't have to modify *unicode for unshared Unicode
453 objects, since we can modify them in-place. */
454 return unicode_resize(v, length);
455}
456
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
458{
459 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
460}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000461
Guido van Rossumd57fd912000-03-10 22:53:23 +0000462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000463 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000464{
465 PyUnicodeObject *unicode;
466
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000467 /* If the Unicode data is known at construction time, we can apply
468 some optimizations which share commonly used objects. */
469 if (u != NULL) {
470
471 /* Optimization for empty strings */
472 if (size == 0 && unicode_empty != NULL) {
473 Py_INCREF(unicode_empty);
474 return (PyObject *)unicode_empty;
475 }
476
477 /* Single character Unicode objects in the Latin-1 range are
478 shared when using this constructor */
479 if (size == 1 && *u < 256) {
480 unicode = unicode_latin1[*u];
481 if (!unicode) {
482 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000483 if (!unicode)
484 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000485 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000486 unicode_latin1[*u] = unicode;
487 }
488 Py_INCREF(unicode);
489 return (PyObject *)unicode;
490 }
491 }
Tim Petersced69f82003-09-16 20:30:58 +0000492
Guido van Rossumd57fd912000-03-10 22:53:23 +0000493 unicode = _PyUnicode_New(size);
494 if (!unicode)
495 return NULL;
496
497 /* Copy the Unicode data into the new object */
498 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000499 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500
501 return (PyObject *)unicode;
502}
503
Walter Dörwaldd2034312007-05-18 16:29:38 +0000504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000505{
506 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000507
508 if (size < 0) {
509 PyErr_SetString(PyExc_SystemError,
510 "Negative size passed to PyUnicode_FromStringAndSize");
511 return NULL;
512 }
513
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000514 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000515 some optimizations which share commonly used objects.
516 Also, this means the input must be UTF-8, so fall back to the
517 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518 if (u != NULL) {
519
520 /* Optimization for empty strings */
521 if (size == 0 && unicode_empty != NULL) {
522 Py_INCREF(unicode_empty);
523 return (PyObject *)unicode_empty;
524 }
525
Martin v. Löwis9c121062007-08-05 20:26:11 +0000526 /* Single characters are shared when using this constructor.
527 Restrict to ASCII, since the input must be UTF-8. */
528 if (size == 1 && Py_CHARMASK(*u) < 128) {
Christian Heimesbbe741d2008-03-28 10:53:29 +0000529 unicode = unicode_latin1[Py_CHARMASK(*u)];
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 if (!unicode) {
531 unicode = _PyUnicode_New(1);
532 if (!unicode)
533 return NULL;
Guido van Rossum00058aa2007-07-19 18:21:28 +0000534 unicode->str[0] = Py_CHARMASK(*u);
Christian Heimesbbe741d2008-03-28 10:53:29 +0000535 unicode_latin1[Py_CHARMASK(*u)] = unicode;
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000536 }
537 Py_INCREF(unicode);
538 return (PyObject *)unicode;
539 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000540
541 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000542 }
543
Walter Dörwald55507312007-05-18 13:12:10 +0000544 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000545 if (!unicode)
546 return NULL;
547
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000548 return (PyObject *)unicode;
549}
550
Walter Dörwaldd2034312007-05-18 16:29:38 +0000551PyObject *PyUnicode_FromString(const char *u)
552{
553 size_t size = strlen(u);
554 if (size > PY_SSIZE_T_MAX) {
555 PyErr_SetString(PyExc_OverflowError, "input too long");
556 return NULL;
557 }
558
559 return PyUnicode_FromStringAndSize(u, size);
560}
561
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562#ifdef HAVE_WCHAR_H
563
564PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Martin v. Löwis18e16552006-02-15 17:27:45 +0000565 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566{
567 PyUnicodeObject *unicode;
568
569 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000570 if (size == 0)
571 return PyUnicode_FromStringAndSize(NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572 PyErr_BadInternalCall();
573 return NULL;
574 }
575
Martin v. Löwis790465f2008-04-05 20:41:37 +0000576 if (size == -1) {
577 size = wcslen(w);
578 }
579
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580 unicode = _PyUnicode_New(size);
581 if (!unicode)
582 return NULL;
583
584 /* Copy the wchar_t data into the new object */
585#ifdef HAVE_USABLE_WCHAR_T
586 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000587#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588 {
589 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000590 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000591 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000592 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000593 *u++ = *w++;
594 }
595#endif
596
597 return (PyObject *)unicode;
598}
599
Walter Dörwald346737f2007-05-31 10:44:43 +0000600static void
601makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
602{
603 *fmt++ = '%';
604 if (width) {
605 if (zeropad)
606 *fmt++ = '0';
607 fmt += sprintf(fmt, "%d", width);
608 }
609 if (precision)
610 fmt += sprintf(fmt, ".%d", precision);
611 if (longflag)
612 *fmt++ = 'l';
613 else if (size_tflag) {
614 char *f = PY_FORMAT_SIZE_T;
615 while (*f)
616 *fmt++ = *f++;
617 }
618 *fmt++ = c;
619 *fmt = '\0';
620}
621
/* Append the NUL-terminated char string `string` at the output cursor `s`,
   advancing `s` past the copied characters; the NUL itself is not copied.
   Relies on the variables `s` and `copy` being declared in the calling
   scope.  The argument is parenthesized so any expression may be passed. */
#define appendstring(string) {for (copy = (string); *copy;) *s++ = *copy++;}
623
624PyObject *
625PyUnicode_FromFormatV(const char *format, va_list vargs)
626{
627 va_list count;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000628 Py_ssize_t callcount = 0;
629 PyObject **callresults = NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +0000630 PyObject **callresult = NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000631 Py_ssize_t n = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000632 int width = 0;
633 int precision = 0;
634 int zeropad;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000635 const char* f;
636 Py_UNICODE *s;
637 PyObject *string;
638 /* used by sprintf */
639 char buffer[21];
Walter Dörwald346737f2007-05-31 10:44:43 +0000640 /* use abuffer instead of buffer, if we need more space
641 * (which can happen if there's a format specifier with width). */
642 char *abuffer = NULL;
643 char *realbuffer;
644 Py_ssize_t abuffersize = 0;
645 char fmt[60]; /* should be enough for %0width.precisionld */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000646 const char *copy;
647
648#ifdef VA_LIST_IS_ARRAY
649 Py_MEMCPY(count, vargs, sizeof(va_list));
650#else
651#ifdef __va_copy
652 __va_copy(count, vargs);
653#else
654 count = vargs;
655#endif
656#endif
Georg Brandl559e5d72008-06-11 18:37:52 +0000657 /* step 1: count the number of %S/%R/%A format specifications
658 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
659 * these objects once during step 3 and put the result in
660 an array) */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000661 for (f = format; *f; f++) {
Georg Brandl559e5d72008-06-11 18:37:52 +0000662 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000663 ++callcount;
664 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000665 /* step 2: allocate memory for the results of
Thomas Heller519a0422007-11-15 20:48:54 +0000666 * PyObject_Str()/PyObject_Repr() calls */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000667 if (callcount) {
Christian Heimesb186d002008-03-18 15:15:01 +0000668 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000669 if (!callresults) {
670 PyErr_NoMemory();
671 return NULL;
672 }
673 callresult = callresults;
674 }
675 /* step 3: figure out how large a buffer we need */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000676 for (f = format; *f; f++) {
677 if (*f == '%') {
678 const char* p = f;
Walter Dörwald346737f2007-05-31 10:44:43 +0000679 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000680 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000681 width = (width*10) + *f++ - '0';
Christian Heimesfe337bf2008-03-23 21:54:12 +0000682 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
Walter Dörwaldd2034312007-05-18 16:29:38 +0000683 ;
684
685 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
686 * they don't affect the amount of space we reserve.
687 */
688 if ((*f == 'l' || *f == 'z') &&
689 (f[1] == 'd' || f[1] == 'u'))
Eric Smithddd25822007-08-27 11:33:42 +0000690 ++f;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000691
692 switch (*f) {
693 case 'c':
694 (void)va_arg(count, int);
695 /* fall through... */
696 case '%':
697 n++;
698 break;
699 case 'd': case 'u': case 'i': case 'x':
700 (void) va_arg(count, int);
701 /* 20 bytes is enough to hold a 64-bit
702 integer. Decimal takes the most space.
Walter Dörwald346737f2007-05-31 10:44:43 +0000703 This isn't enough for octal.
704 If a width is specified we need more
705 (which we allocate later). */
706 if (width < 20)
707 width = 20;
708 n += width;
709 if (abuffersize < width)
710 abuffersize = width;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000711 break;
712 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000713 {
714 /* UTF-8 */
715 unsigned char*s;
716 s = va_arg(count, unsigned char*);
717 while (*s) {
718 if (*s < 128) {
719 n++; s++;
720 } else if (*s < 0xc0) {
721 /* invalid UTF-8 */
722 n++; s++;
723 } else if (*s < 0xc0) {
724 n++;
725 s++; if(!*s)break;
726 s++;
727 } else if (*s < 0xe0) {
728 n++;
729 s++; if(!*s)break;
730 s++; if(!*s)break;
731 s++;
732 } else {
733 #ifdef Py_UNICODE_WIDE
734 n++;
735 #else
736 n+=2;
737 #endif
738 s++; if(!*s)break;
739 s++; if(!*s)break;
740 s++; if(!*s)break;
741 s++;
742 }
743 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000744 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000745 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000746 case 'U':
747 {
748 PyObject *obj = va_arg(count, PyObject *);
749 assert(obj && PyUnicode_Check(obj));
750 n += PyUnicode_GET_SIZE(obj);
751 break;
752 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000753 case 'V':
754 {
755 PyObject *obj = va_arg(count, PyObject *);
756 const char *str = va_arg(count, const char *);
757 assert(obj || str);
758 assert(!obj || PyUnicode_Check(obj));
759 if (obj)
760 n += PyUnicode_GET_SIZE(obj);
761 else
762 n += strlen(str);
763 break;
764 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000765 case 'S':
766 {
767 PyObject *obj = va_arg(count, PyObject *);
768 PyObject *str;
769 assert(obj);
Thomas Heller519a0422007-11-15 20:48:54 +0000770 str = PyObject_Str(obj);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000771 if (!str)
772 goto fail;
773 n += PyUnicode_GET_SIZE(str);
774 /* Remember the str and switch to the next slot */
775 *callresult++ = str;
776 break;
777 }
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000778 case 'R':
779 {
780 PyObject *obj = va_arg(count, PyObject *);
781 PyObject *repr;
782 assert(obj);
783 repr = PyObject_Repr(obj);
784 if (!repr)
785 goto fail;
786 n += PyUnicode_GET_SIZE(repr);
787 /* Remember the repr and switch to the next slot */
788 *callresult++ = repr;
789 break;
790 }
Georg Brandl559e5d72008-06-11 18:37:52 +0000791 case 'A':
792 {
793 PyObject *obj = va_arg(count, PyObject *);
794 PyObject *ascii;
795 assert(obj);
796 ascii = PyObject_ASCII(obj);
797 if (!ascii)
798 goto fail;
799 n += PyUnicode_GET_SIZE(ascii);
800 /* Remember the repr and switch to the next slot */
801 *callresult++ = ascii;
802 break;
803 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000804 case 'p':
805 (void) va_arg(count, int);
806 /* maximum 64-bit pointer representation:
807 * 0xffffffffffffffff
808 * so 19 characters is enough.
809 * XXX I count 18 -- what's the extra for?
810 */
811 n += 19;
812 break;
813 default:
814 /* if we stumble upon an unknown
815 formatting code, copy the rest of
816 the format string to the output
817 string. (we cannot just skip the
818 code, since there's no way to know
819 what's in the argument list) */
820 n += strlen(p);
821 goto expand;
822 }
823 } else
824 n++;
825 }
826 expand:
Walter Dörwald346737f2007-05-31 10:44:43 +0000827 if (abuffersize > 20) {
Christian Heimesb186d002008-03-18 15:15:01 +0000828 abuffer = PyObject_Malloc(abuffersize);
Walter Dörwald346737f2007-05-31 10:44:43 +0000829 if (!abuffer) {
830 PyErr_NoMemory();
831 goto fail;
832 }
833 realbuffer = abuffer;
834 }
835 else
836 realbuffer = buffer;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000837 /* step 4: fill the buffer */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000838 /* Since we've analyzed how much space we need for the worst case,
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000839 we don't have to resize the string.
840 There can be no errors beyond this point. */
Walter Dörwaldd2034312007-05-18 16:29:38 +0000841 string = PyUnicode_FromUnicode(NULL, n);
842 if (!string)
Walter Dörwald346737f2007-05-31 10:44:43 +0000843 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000844
845 s = PyUnicode_AS_UNICODE(string);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000846 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000847
848 for (f = format; *f; f++) {
849 if (*f == '%') {
850 const char* p = f++;
851 int longflag = 0;
852 int size_tflag = 0;
Walter Dörwald346737f2007-05-31 10:44:43 +0000853 zeropad = (*f == '0');
854 /* parse the width.precision part */
855 width = 0;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000856 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000857 width = (width*10) + *f++ - '0';
858 precision = 0;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859 if (*f == '.') {
860 f++;
Christian Heimesfe337bf2008-03-23 21:54:12 +0000861 while (ISDIGIT((unsigned)*f))
Walter Dörwald346737f2007-05-31 10:44:43 +0000862 precision = (precision*10) + *f++ - '0';
Walter Dörwaldd2034312007-05-18 16:29:38 +0000863 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000864 /* handle the long flag, but only for %ld and %lu.
865 others can be added when necessary. */
866 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
867 longflag = 1;
868 ++f;
869 }
870 /* handle the size_t flag. */
871 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
872 size_tflag = 1;
873 ++f;
874 }
875
876 switch (*f) {
877 case 'c':
878 *s++ = va_arg(vargs, int);
879 break;
880 case 'd':
Walter Dörwald346737f2007-05-31 10:44:43 +0000881 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000882 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000883 sprintf(realbuffer, fmt, va_arg(vargs, long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000884 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000885 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000886 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000887 sprintf(realbuffer, fmt, va_arg(vargs, int));
888 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000889 break;
890 case 'u':
Walter Dörwald346737f2007-05-31 10:44:43 +0000891 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
Walter Dörwaldd2034312007-05-18 16:29:38 +0000892 if (longflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000893 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000894 else if (size_tflag)
Walter Dörwald346737f2007-05-31 10:44:43 +0000895 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000896 else
Walter Dörwald346737f2007-05-31 10:44:43 +0000897 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
898 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000899 break;
900 case 'i':
Walter Dörwald346737f2007-05-31 10:44:43 +0000901 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
902 sprintf(realbuffer, fmt, va_arg(vargs, int));
903 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000904 break;
905 case 'x':
Walter Dörwald346737f2007-05-31 10:44:43 +0000906 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000909 break;
910 case 's':
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000911 {
912 /* Parameter must be UTF-8 encoded.
913 In case of encoding errors, use
914 the replacement character. */
915 PyObject *u;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000916 p = va_arg(vargs, char*);
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000917 u = PyUnicode_DecodeUTF8(p, strlen(p),
918 "replace");
919 if (!u)
920 goto fail;
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
922 PyUnicode_GET_SIZE(u));
923 s += PyUnicode_GET_SIZE(u);
924 Py_DECREF(u);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000925 break;
Martin v. Löwis90d1fcd2007-08-31 11:01:23 +0000926 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000927 case 'U':
928 {
929 PyObject *obj = va_arg(vargs, PyObject *);
Walter Dörwald5c2fab62007-05-24 19:51:02 +0000930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000933 break;
934 }
Walter Dörwaldd7fb7642007-06-11 16:36:59 +0000935 case 'V':
936 {
937 PyObject *obj = va_arg(vargs, PyObject *);
938 const char *str = va_arg(vargs, const char *);
939 if (obj) {
940 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942 s += size;
943 } else {
944 appendstring(str);
945 }
946 break;
947 }
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000948 case 'S':
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000949 case 'R':
950 {
Guido van Rossum755114a2007-06-13 01:04:27 +0000951 Py_UNICODE *ucopy;
952 Py_ssize_t usize;
953 Py_ssize_t upos;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000954 /* unused, since we already have the result */
955 (void) va_arg(vargs, PyObject *);
Guido van Rossum755114a2007-06-13 01:04:27 +0000956 ucopy = PyUnicode_AS_UNICODE(*callresult);
957 usize = PyUnicode_GET_SIZE(*callresult);
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000958 for (upos = 0; upos<usize;)
959 *s++ = ucopy[upos++];
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000960 /* We're done with the unicode()/repr() => forget it */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000961 Py_DECREF(*callresult);
Walter Dörwald1be7e3f2007-05-23 21:02:42 +0000962 /* switch to next unicode()/repr() result */
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000963 ++callresult;
964 break;
965 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000966 case 'p':
967 sprintf(buffer, "%p", va_arg(vargs, void*));
968 /* %p is ill-defined: ensure leading 0x. */
969 if (buffer[1] == 'X')
970 buffer[1] = 'x';
971 else if (buffer[1] != 'x') {
972 memmove(buffer+2, buffer, strlen(buffer)+1);
973 buffer[0] = '0';
974 buffer[1] = 'x';
975 }
976 appendstring(buffer);
977 break;
978 case '%':
979 *s++ = '%';
980 break;
981 default:
982 appendstring(p);
983 goto end;
984 }
985 } else
986 *s++ = *f;
987 }
988
989 end:
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000990 if (callresults)
Christian Heimesb186d002008-03-18 15:15:01 +0000991 PyObject_Free(callresults);
Walter Dörwald346737f2007-05-31 10:44:43 +0000992 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +0000993 PyObject_Free(abuffer);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000994 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995 return string;
Walter Dörwald7569dfe2007-05-19 21:49:49 +0000996 fail:
997 if (callresults) {
998 PyObject **callresult2 = callresults;
Guido van Rossum307fa8c2007-07-16 20:46:27 +0000999 while (callresult2 < callresult) {
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001000 Py_DECREF(*callresult2);
1001 ++callresult2;
1002 }
Christian Heimesb186d002008-03-18 15:15:01 +00001003 PyObject_Free(callresults);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001004 }
Walter Dörwald346737f2007-05-31 10:44:43 +00001005 if (abuffer)
Christian Heimesb186d002008-03-18 15:15:01 +00001006 PyObject_Free(abuffer);
Walter Dörwald7569dfe2007-05-19 21:49:49 +00001007 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001008}
1009
1010#undef appendstring
1011
1012PyObject *
1013PyUnicode_FromFormat(const char *format, ...)
1014{
1015 PyObject* ret;
1016 va_list vargs;
1017
1018#ifdef HAVE_STDARG_PROTOTYPES
1019 va_start(vargs, format);
1020#else
1021 va_start(vargs);
1022#endif
1023 ret = PyUnicode_FromFormatV(format, vargs);
1024 va_end(vargs);
1025 return ret;
1026}
1027
Martin v. Löwis18e16552006-02-15 17:27:45 +00001028Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1029 wchar_t *w,
1030 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031{
1032 if (unicode == NULL) {
1033 PyErr_BadInternalCall();
1034 return -1;
1035 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001036
1037 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001039 size = PyUnicode_GET_SIZE(unicode) + 1;
1040
Guido van Rossumd57fd912000-03-10 22:53:23 +00001041#ifdef HAVE_USABLE_WCHAR_T
1042 memcpy(w, unicode->str, size * sizeof(wchar_t));
1043#else
1044 {
1045 register Py_UNICODE *u;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001046 register Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +00001048 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 *w++ = *u++;
1050 }
1051#endif
1052
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001053 if (size > PyUnicode_GET_SIZE(unicode))
1054 return PyUnicode_GET_SIZE(unicode);
1055 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 return size;
1057}
1058
1059#endif
1060
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061PyObject *PyUnicode_FromOrdinal(int ordinal)
1062{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001063 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001064
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065 if (ordinal < 0 || ordinal > 0x10ffff) {
1066 PyErr_SetString(PyExc_ValueError,
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001067 "chr() arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001068 return NULL;
1069 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001070
1071#ifndef Py_UNICODE_WIDE
1072 if (ordinal > 0xffff) {
1073 ordinal -= 0x10000;
1074 s[0] = 0xD800 | (ordinal >> 10);
1075 s[1] = 0xDC00 | (ordinal & 0x3FF);
1076 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001077 }
1078#endif
1079
Hye-Shik Chang40574832004-04-06 07:24:51 +00001080 s[0] = (Py_UNICODE)ordinal;
1081 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001082}
1083
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084PyObject *PyUnicode_FromObject(register PyObject *obj)
1085{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001086 /* XXX Perhaps we should make this API an alias of
Thomas Heller519a0422007-11-15 20:48:54 +00001087 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 if (PyUnicode_CheckExact(obj)) {
1089 Py_INCREF(obj);
1090 return obj;
1091 }
1092 if (PyUnicode_Check(obj)) {
1093 /* For a Unicode subtype that's not a Unicode object,
1094 return a true Unicode object with the same data. */
1095 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1096 PyUnicode_GET_SIZE(obj));
1097 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001098 PyErr_Format(PyExc_TypeError,
1099 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001100 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001101 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001102}
1103
1104PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1105 const char *encoding,
1106 const char *errors)
1107{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001108 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001109 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001110 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001111
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 if (obj == NULL) {
1113 PyErr_BadInternalCall();
1114 return NULL;
1115 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001116
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001117 if (PyUnicode_Check(obj)) {
1118 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001119 "decoding str is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001120 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001121 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122
1123 /* Coerce object */
Christian Heimes72b710a2008-05-26 13:28:38 +00001124 if (PyBytes_Check(obj)) {
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001125 s = PyBytes_AS_STRING(obj);
1126 len = PyBytes_GET_SIZE(obj);
1127 }
1128 else if (PyByteArray_Check(obj)) {
1129 s = PyByteArray_AS_STRING(obj);
1130 len = PyByteArray_GET_SIZE(obj);
1131 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001132 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1133 /* Overwrite the error message with something more useful in
1134 case of a TypeError. */
1135 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001136 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001137 "coercing to str: need string or buffer, "
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001138 "%.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00001139 Py_TYPE(obj)->tp_name);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001140 goto onError;
1141 }
Tim Petersced69f82003-09-16 20:30:58 +00001142
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001143 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144 if (len == 0) {
1145 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001146 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147 }
Tim Petersced69f82003-09-16 20:30:58 +00001148 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001149 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001150
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001151 return v;
1152
1153 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155}
1156
1157PyObject *PyUnicode_Decode(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001158 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 const char *encoding,
1160 const char *errors)
1161{
1162 PyObject *buffer = NULL, *unicode;
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001163 Py_buffer info;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001164 char lower[20]; /* Enough for any encoding name we recognize */
1165 char *l;
1166 const char *e;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001167
1168 if (encoding == NULL)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001169 encoding = PyUnicode_GetDefaultEncoding();
1170
1171 /* Convert encoding to lower case and replace '_' with '-' in order to
1172 catch e.g. UTF_8 */
1173 e = encoding;
1174 l = lower;
1175 while (*e && l < &lower[(sizeof lower) - 2]) {
1176 if (ISUPPER(*e)) {
1177 *l++ = TOLOWER(*e++);
1178 }
1179 else if (*e == '_') {
1180 *l++ = '-';
1181 e++;
1182 }
1183 else {
1184 *l++ = *e++;
1185 }
1186 }
1187 *l = '\0';
Fred Drakee4315f52000-05-09 19:53:39 +00001188
1189 /* Shortcuts for common default encodings */
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001190 if (strcmp(lower, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 return PyUnicode_DecodeUTF8(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001192 else if ((strcmp(lower, "latin-1") == 0) ||
1193 (strcmp(lower, "iso-8859-1") == 0))
Fred Drakee4315f52000-05-09 19:53:39 +00001194 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001196 else if (strcmp(lower, "mbcs") == 0)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001197 return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001199 else if (strcmp(lower, "ascii") == 0)
Fred Drakee4315f52000-05-09 19:53:39 +00001200 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumdaa251c2007-10-25 23:47:33 +00001201 else if (strcmp(lower, "utf-16") == 0)
1202 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1203 else if (strcmp(lower, "utf-32") == 0)
1204 return PyUnicode_DecodeUTF32(s, size, errors, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205
1206 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001207 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001208 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001209 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001210 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 if (buffer == NULL)
1212 goto onError;
1213 unicode = PyCodec_Decode(buffer, encoding, errors);
1214 if (unicode == NULL)
1215 goto onError;
1216 if (!PyUnicode_Check(unicode)) {
1217 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001218 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001219 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 Py_DECREF(unicode);
1221 goto onError;
1222 }
1223 Py_DECREF(buffer);
1224 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001225
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 onError:
1227 Py_XDECREF(buffer);
1228 return NULL;
1229}
1230
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001231PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1232 const char *encoding,
1233 const char *errors)
1234{
1235 PyObject *v;
1236
1237 if (!PyUnicode_Check(unicode)) {
1238 PyErr_BadArgument();
1239 goto onError;
1240 }
1241
1242 if (encoding == NULL)
1243 encoding = PyUnicode_GetDefaultEncoding();
1244
1245 /* Decode via the codec registry */
1246 v = PyCodec_Decode(unicode, encoding, errors);
1247 if (v == NULL)
1248 goto onError;
1249 return v;
1250
1251 onError:
1252 return NULL;
1253}
1254
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001255PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1256 const char *encoding,
1257 const char *errors)
1258{
1259 PyObject *v;
1260
1261 if (!PyUnicode_Check(unicode)) {
1262 PyErr_BadArgument();
1263 goto onError;
1264 }
1265
1266 if (encoding == NULL)
1267 encoding = PyUnicode_GetDefaultEncoding();
1268
1269 /* Decode via the codec registry */
1270 v = PyCodec_Decode(unicode, encoding, errors);
1271 if (v == NULL)
1272 goto onError;
1273 if (!PyUnicode_Check(v)) {
1274 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001275 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001276 Py_TYPE(v)->tp_name);
1277 Py_DECREF(v);
1278 goto onError;
1279 }
1280 return v;
1281
1282 onError:
1283 return NULL;
1284}
1285
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001287 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 const char *encoding,
1289 const char *errors)
1290{
1291 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 unicode = PyUnicode_FromUnicode(s, size);
1294 if (unicode == NULL)
1295 return NULL;
1296 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1297 Py_DECREF(unicode);
1298 return v;
1299}
1300
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001301PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1302 const char *encoding,
1303 const char *errors)
1304{
1305 PyObject *v;
1306
1307 if (!PyUnicode_Check(unicode)) {
1308 PyErr_BadArgument();
1309 goto onError;
1310 }
1311
1312 if (encoding == NULL)
1313 encoding = PyUnicode_GetDefaultEncoding();
1314
1315 /* Encode via the codec registry */
1316 v = PyCodec_Encode(unicode, encoding, errors);
1317 if (v == NULL)
1318 goto onError;
1319 return v;
1320
1321 onError:
1322 return NULL;
1323}
1324
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1326 const char *encoding,
1327 const char *errors)
1328{
1329 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001330
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 if (!PyUnicode_Check(unicode)) {
1332 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 }
Fred Drakee4315f52000-05-09 19:53:39 +00001335
Tim Petersced69f82003-09-16 20:30:58 +00001336 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +00001337 encoding = PyUnicode_GetDefaultEncoding();
1338
1339 /* Shortcuts for common default encodings */
1340 if (errors == NULL) {
1341 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +00001342 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001343 else if (strcmp(encoding, "latin-1") == 0)
1344 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001345#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1346 else if (strcmp(encoding, "mbcs") == 0)
1347 return PyUnicode_AsMBCSString(unicode);
1348#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001349 else if (strcmp(encoding, "ascii") == 0)
1350 return PyUnicode_AsASCIIString(unicode);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001351 /* During bootstrap, we may need to find the encodings
1352 package, to load the file system encoding, and require the
1353 file system encoding in order to load the encodings
1354 package.
1355
1356 Break out of this dependency by assuming that the path to
1357 the encodings module is ASCII-only. XXX could try wcstombs
1358 instead, if the file system encoding is the locale's
1359 encoding. */
1360 else if (Py_FileSystemDefaultEncoding &&
1361 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1362 !PyThreadState_GET()->interp->codecs_initialized)
1363 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
1366 /* Encode via the codec registry */
1367 v = PyCodec_Encode(unicode, encoding, errors);
1368 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001369 return NULL;
1370
1371 /* The normal path */
1372 if (PyBytes_Check(v))
1373 return v;
1374
1375 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001376 if (PyByteArray_Check(v)) {
1377 char msg[100];
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001378 PyObject *b;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001379 PyOS_snprintf(msg, sizeof(msg),
1380 "encoder %s returned buffer instead of bytes",
1381 encoding);
1382 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001383 Py_DECREF(v);
1384 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001385 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001386
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001387 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1388 Py_DECREF(v);
1389 return b;
1390 }
1391
1392 PyErr_Format(PyExc_TypeError,
1393 "encoder did not return a bytes object (type=%.400s)",
1394 Py_TYPE(v)->tp_name);
1395 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001396 return NULL;
1397}
1398
1399PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1400 const char *encoding,
1401 const char *errors)
1402{
1403 PyObject *v;
1404
1405 if (!PyUnicode_Check(unicode)) {
1406 PyErr_BadArgument();
1407 goto onError;
1408 }
1409
1410 if (encoding == NULL)
1411 encoding = PyUnicode_GetDefaultEncoding();
1412
1413 /* Encode via the codec registry */
1414 v = PyCodec_Encode(unicode, encoding, errors);
1415 if (v == NULL)
1416 goto onError;
1417 if (!PyUnicode_Check(v)) {
1418 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001419 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001420 Py_TYPE(v)->tp_name);
1421 Py_DECREF(v);
1422 goto onError;
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001425
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 onError:
1427 return NULL;
1428}
1429
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001430PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1431 const char *errors)
1432{
1433 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001434 if (v)
1435 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001436 if (errors != NULL)
1437 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001438 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001439 PyUnicode_GET_SIZE(unicode),
1440 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001441 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001442 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001443 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001444 return v;
1445}
1446
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001447PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001448PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001449 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001450 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1451}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001452
Christian Heimes5894ba72007-11-04 11:43:14 +00001453PyObject*
1454PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1455{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001456 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1457 can be undefined. If it is case, decode using UTF-8. The following assumes
1458 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1459 bootstrapping process where the codecs aren't ready yet.
1460 */
1461 if (Py_FileSystemDefaultEncoding) {
1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001463 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001464 return PyUnicode_DecodeMBCS(s, size, "replace");
1465 }
1466#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001467 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001468 return PyUnicode_DecodeUTF8(s, size, "replace");
1469 }
1470#endif
1471 return PyUnicode_Decode(s, size,
1472 Py_FileSystemDefaultEncoding,
1473 "replace");
1474 }
1475 else {
1476 return PyUnicode_DecodeUTF8(s, size, "replace");
1477 }
1478}
1479
Martin v. Löwis5b222132007-06-10 09:51:05 +00001480char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001481_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001482{
Christian Heimesf3863112007-11-22 07:46:41 +00001483 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001484 if (!PyUnicode_Check(unicode)) {
1485 PyErr_BadArgument();
1486 return NULL;
1487 }
Christian Heimesf3863112007-11-22 07:46:41 +00001488 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1489 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001490 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001491 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001492 *psize = PyBytes_GET_SIZE(bytes);
1493 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001494}
1495
1496char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001497_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001498{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001499 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001500}
1501
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1503{
1504 if (!PyUnicode_Check(unicode)) {
1505 PyErr_BadArgument();
1506 goto onError;
1507 }
1508 return PyUnicode_AS_UNICODE(unicode);
1509
1510 onError:
1511 return NULL;
1512}
1513
Martin v. Löwis18e16552006-02-15 17:27:45 +00001514Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515{
1516 if (!PyUnicode_Check(unicode)) {
1517 PyErr_BadArgument();
1518 goto onError;
1519 }
1520 return PyUnicode_GET_SIZE(unicode);
1521
1522 onError:
1523 return -1;
1524}
1525
Thomas Wouters78890102000-07-22 19:25:51 +00001526const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001527{
1528 return unicode_default_encoding;
1529}
1530
1531int PyUnicode_SetDefaultEncoding(const char *encoding)
1532{
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001533 if (strcmp(encoding, unicode_default_encoding) != 0) {
1534 PyErr_Format(PyExc_ValueError,
1535 "Can only set default encoding to %s",
1536 unicode_default_encoding);
1537 return -1;
1538 }
Fred Drakee4315f52000-05-09 19:53:39 +00001539 return 0;
Fred Drakee4315f52000-05-09 19:53:39 +00001540}
1541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001542/* error handling callback helper:
1543 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001544 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001545 and adjust various state variables.
1546 return 0 on success, -1 on error
1547*/
1548
1549static
1550int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1551 const char *encoding, const char *reason,
Walter Dörwalda651d3d2007-08-30 15:29:21 +00001552 const char **input, const char **inend, Py_ssize_t *startinpos,
1553 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001554 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001555{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001556 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001557
1558 PyObject *restuple = NULL;
1559 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001560 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001561 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001562 Py_ssize_t requiredsize;
1563 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001564 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001565 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001566 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001567 int res = -1;
1568
1569 if (*errorHandler == NULL) {
1570 *errorHandler = PyCodec_LookupError(errors);
1571 if (*errorHandler == NULL)
1572 goto onError;
1573 }
1574
1575 if (*exceptionObject == NULL) {
1576 *exceptionObject = PyUnicodeDecodeError_Create(
Walter Dörwalde78178e2007-07-30 13:31:40 +00001577 encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001578 if (*exceptionObject == NULL)
1579 goto onError;
1580 }
1581 else {
1582 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1583 goto onError;
1584 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1585 goto onError;
1586 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1587 goto onError;
1588 }
1589
1590 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1591 if (restuple == NULL)
1592 goto onError;
1593 if (!PyTuple_Check(restuple)) {
1594 PyErr_Format(PyExc_TypeError, &argparse[4]);
1595 goto onError;
1596 }
1597 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1598 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001599
1600 /* Copy back the bytes variables, which might have been modified by the
1601 callback */
1602 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1603 if (!inputobj)
1604 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001605 if (!PyBytes_Check(inputobj)) {
Walter Dörwalde78178e2007-07-30 13:31:40 +00001606 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1607 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001608 *input = PyBytes_AS_STRING(inputobj);
1609 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001610 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001611 /* we can DECREF safely, as the exception has another reference,
1612 so the object won't go away. */
1613 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001615 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001616 newpos = insize+newpos;
1617 if (newpos<0 || newpos>insize) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00001618 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001619 goto onError;
1620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001621
1622 /* need more space? (at least enough for what we
1623 have+the replacement+the rest of the string (starting
1624 at the new input position), so we won't have to check space
1625 when there are no errors in the rest of the string) */
1626 repptr = PyUnicode_AS_UNICODE(repunicode);
1627 repsize = PyUnicode_GET_SIZE(repunicode);
1628 requiredsize = *outpos + repsize + insize-newpos;
1629 if (requiredsize > outsize) {
1630 if (requiredsize<2*outsize)
1631 requiredsize = 2*outsize;
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632 if (_PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001633 goto onError;
1634 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1635 }
1636 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001637 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001638 Py_UNICODE_COPY(*outptr, repptr, repsize);
1639 *outptr += repsize;
1640 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001642 /* we made it! */
1643 res = 0;
1644
1645 onError:
1646 Py_XDECREF(restuple);
1647 return res;
1648}
1649
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001650/* --- UTF-7 Codec -------------------------------------------------------- */
1651
1652/* see RFC2152 for details */
1653
static
char utf7_special[128] = {
    /* Classification of the 128 ASCII code points for the UTF-7 codec,
       i.e. whether a character is special (cannot be directly encoded):
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */

    /* 0x00 - 0x0f: TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f: space is whitespace; '+' and '/' are special */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f: digits, ':' and '?' are not special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f: '@' and 'A'..'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f: 'P'..'Z'; the backslash is special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f: '`' and 'a'..'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f: 'p'..'z'; '~' and DEL are special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1672
/* Helper macros for the UTF-7 codec.  All macro arguments are
   parenthesized and the statement-like macros are wrapped in
   do { ... } while (0), so each expands to exactly one statement. */

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c has to be encoded in a shift sequence: anything
   outside 1..127, table class 1, and -- when the respective flag is
   set -- whitespace (class 2) or Set O (class 3). */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is a character of the base64 alphabet. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* Value (0..63) of base64 digit c; c must satisfy B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to *out for as long as at least 6 of the `bits`
   pending low bits of `ch` remain; updates `out` and `bits`. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit 16-bit units to *out for as long as at least 16 of the `bits`
   pending low bits of `ch` remain; updates `out`, `bits` and
   `surrogate`.  Relies on the enclosing function providing the variable
   `errmsg` and the label `utf7Error`. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001715
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001716PyObject *PyUnicode_DecodeUTF7(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001717 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 const char *errors)
1719{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001720 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1721}
1722
1723PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1724 Py_ssize_t size,
1725 const char *errors,
1726 Py_ssize_t *consumed)
1727{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001729 Py_ssize_t startinpos;
1730 Py_ssize_t endinpos;
1731 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 const char *e;
1733 PyUnicodeObject *unicode;
1734 Py_UNICODE *p;
1735 const char *errmsg = "";
1736 int inShift = 0;
1737 unsigned int bitsleft = 0;
1738 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001739 int surrogate = 0;
1740 PyObject *errorHandler = NULL;
1741 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742
1743 unicode = _PyUnicode_New(size);
1744 if (!unicode)
1745 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001746 if (size == 0) {
1747 if (consumed)
1748 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001750 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001751
1752 p = unicode->str;
1753 e = s + size;
1754
1755 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001756 Py_UNICODE ch;
1757 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00001758 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759
1760 if (inShift) {
1761 if ((ch == '-') || !B64CHAR(ch)) {
1762 inShift = 0;
1763 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001764
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001765 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1766 if (bitsleft >= 6) {
1767 /* The shift sequence has a partial character in it. If
1768 bitsleft < 6 then we could just classify it as padding
1769 but that is not the case here */
1770
1771 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001772 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 }
1774 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001775 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001776 here so indicate the potential of a misencoded character. */
1777
1778 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1779 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1780 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001781 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001782 }
1783
1784 if (ch == '-') {
1785 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001786 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001787 inShift = 1;
1788 }
1789 } else if (SPECIAL(ch,0,0)) {
1790 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +00001791 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001792 } else {
1793 *p++ = ch;
1794 }
1795 } else {
1796 charsleft = (charsleft << 6) | UB64(ch);
1797 bitsleft += 6;
1798 s++;
1799 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1800 }
1801 }
1802 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001803 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001804 s++;
1805 if (s < e && *s == '-') {
1806 s++;
1807 *p++ = '+';
1808 } else
1809 {
1810 inShift = 1;
1811 bitsleft = 0;
1812 }
1813 }
1814 else if (SPECIAL(ch,0,0)) {
Walter Dörwald2b65c752007-08-30 15:35:26 +00001815 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001816 errmsg = "unexpected special character";
1817 s++;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001818 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001819 }
1820 else {
1821 *p++ = ch;
1822 s++;
1823 }
1824 continue;
1825 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 outpos = p-PyUnicode_AS_UNICODE(unicode);
1827 endinpos = s-starts;
1828 if (unicode_decode_call_errorhandler(
1829 errors, &errorHandler,
1830 "utf7", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00001831 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001832 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001833 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001834 }
1835
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001836 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001837 outpos = p-PyUnicode_AS_UNICODE(unicode);
1838 endinpos = size;
1839 if (unicode_decode_call_errorhandler(
1840 errors, &errorHandler,
1841 "utf7", "unterminated shift sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00001842 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001843 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845 if (s < e)
1846 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001848 if (consumed) {
1849 if(inShift)
1850 *consumed = startinpos;
1851 else
1852 *consumed = s-starts;
1853 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001854
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001855 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001856 goto onError;
1857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001860 return (PyObject *)unicode;
1861
1862onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001863 Py_XDECREF(errorHandler);
1864 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001865 Py_DECREF(unicode);
1866 return NULL;
1867}
1868
1869
1870PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001871 Py_ssize_t size,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001872 int encodeSetO,
1873 int encodeWhiteSpace,
1874 const char *errors)
1875{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001876 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001877 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001878 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001879 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001880 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001881 unsigned int bitsleft = 0;
1882 unsigned long charsleft = 0;
1883 char * out;
1884 char * start;
1885
1886 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00001887 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001888
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001889 if (cbAllocated / 5 != size)
1890 return PyErr_NoMemory();
1891
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001892 v = PyBytes_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001893 if (v == NULL)
1894 return NULL;
1895
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001896 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001897 for (;i < size; ++i) {
1898 Py_UNICODE ch = s[i];
1899
1900 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001901 if (ch == '+') {
1902 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001903 *out++ = '-';
1904 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1905 charsleft = ch;
1906 bitsleft = 16;
1907 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001908 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001909 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001910 } else {
1911 *out++ = (char) ch;
1912 }
1913 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001914 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1915 *out++ = B64(charsleft << (6-bitsleft));
1916 charsleft = 0;
1917 bitsleft = 0;
1918 /* Characters not in the BASE64 set implicitly unshift the sequence
1919 so no '-' is required, except if the character is itself a '-' */
1920 if (B64CHAR(ch) || ch == '-') {
1921 *out++ = '-';
1922 }
1923 inShift = 0;
1924 *out++ = (char) ch;
1925 } else {
1926 bitsleft += 16;
1927 charsleft = (charsleft << 16) | ch;
1928 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1929
1930 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001931 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001932 or '-' then the shift sequence will be terminated implicitly and we
1933 don't have to insert a '-'. */
1934
1935 if (bitsleft == 0) {
1936 if (i + 1 < size) {
1937 Py_UNICODE ch2 = s[i+1];
1938
1939 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001940
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001941 } else if (B64CHAR(ch2) || ch2 == '-') {
1942 *out++ = '-';
1943 inShift = 0;
1944 } else {
1945 inShift = 0;
1946 }
1947
1948 }
1949 else {
1950 *out++ = '-';
1951 inShift = 0;
1952 }
1953 }
Tim Petersced69f82003-09-16 20:30:58 +00001954 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001955 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001956 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001957 if (bitsleft) {
1958 *out++= B64(charsleft << (6-bitsleft) );
1959 *out++ = '-';
1960 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00001961 if (_PyBytes_Resize(&v, out - start) < 0)
1962 return NULL;
1963 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001964}
1965
1966#undef SPECIAL
1967#undef B64
1968#undef B64CHAR
1969#undef UB64
1970#undef ENCODE
1971#undef DECODE
1972
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973/* --- UTF-8 Codec -------------------------------------------------------- */
1974
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */

    /* 0x00 - 0x7f: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six bytes; 0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1996
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997PyObject *PyUnicode_DecodeUTF8(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00001998 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999 const char *errors)
2000{
Walter Dörwald69652032004-09-07 20:24:22 +00002001 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2002}
2003
Antoine Pitrouab868312009-01-10 15:40:25 +00002004/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2005#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2006
2007/* Mask to quickly check whether a C 'long' contains a
2008 non-ASCII, UTF8-encoded char. */
2009#if (SIZEOF_LONG == 8)
2010# define ASCII_CHAR_MASK 0x8080808080808080L
2011#elif (SIZEOF_LONG == 4)
2012# define ASCII_CHAR_MASK 0x80808080L
2013#else
2014# error C 'long' size should be either 4 or 8!
2015#endif
2016
Walter Dörwald69652032004-09-07 20:24:22 +00002017PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002018 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002019 const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002020 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002021{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002022 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002023 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002024 Py_ssize_t startinpos;
2025 Py_ssize_t endinpos;
2026 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002027 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 PyUnicodeObject *unicode;
2029 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002030 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002031 PyObject *errorHandler = NULL;
2032 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
2034 /* Note: size will always be longer than the resulting Unicode
2035 character count */
2036 unicode = _PyUnicode_New(size);
2037 if (!unicode)
2038 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002039 if (size == 0) {
2040 if (consumed)
2041 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
2045 /* Unpack UTF-8 encoded data */
2046 p = unicode->str;
2047 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002048 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049
2050 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002051 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052
2053 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002054 /* Fast path for runs of ASCII characters. Given that common UTF-8
2055 input will consist of an overwhelming majority of ASCII
2056 characters, we try to optimize for this case by checking
2057 as many characters as a C 'long' can contain.
2058 First, check if we can do an aligned read, as most CPUs have
2059 a penalty for unaligned reads.
2060 */
2061 if (!((size_t) s & LONG_PTR_MASK)) {
2062 /* Help register allocation */
2063 register const char *_s = s;
2064 register Py_UNICODE *_p = p;
2065 while (_s < aligned_end) {
2066 /* Read a whole long at a time (either 4 or 8 bytes),
2067 and do a fast unrolled copy if it only contains ASCII
2068 characters. */
2069 unsigned long data = *(unsigned long *) _s;
2070 if (data & ASCII_CHAR_MASK)
2071 break;
2072 _p[0] = (unsigned char) _s[0];
2073 _p[1] = (unsigned char) _s[1];
2074 _p[2] = (unsigned char) _s[2];
2075 _p[3] = (unsigned char) _s[3];
2076#if (SIZEOF_LONG == 8)
2077 _p[4] = (unsigned char) _s[4];
2078 _p[5] = (unsigned char) _s[5];
2079 _p[6] = (unsigned char) _s[6];
2080 _p[7] = (unsigned char) _s[7];
2081#endif
2082 _s += SIZEOF_LONG;
2083 _p += SIZEOF_LONG;
2084 }
2085 s = _s;
2086 p = _p;
2087 if (s == e)
2088 break;
2089 ch = (unsigned char)*s;
2090 }
2091 }
2092
2093 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002094 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 s++;
2096 continue;
2097 }
2098
2099 n = utf8_code_length[ch];
2100
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002101 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00002102 if (consumed)
2103 break;
2104 else {
2105 errmsg = "unexpected end of data";
2106 startinpos = s-starts;
2107 endinpos = size;
2108 goto utf8Error;
2109 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002110 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111
2112 switch (n) {
2113
2114 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002115 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002116 startinpos = s-starts;
2117 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002118 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119
2120 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002121 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 startinpos = s-starts;
2123 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002124 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125
2126 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002127 if ((s[1] & 0xc0) != 0x80) {
2128 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002129 startinpos = s-starts;
2130 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002131 goto utf8Error;
2132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002134 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 startinpos = s-starts;
2136 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002137 errmsg = "illegal encoding";
2138 goto utf8Error;
2139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002141 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 break;
2143
2144 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00002145 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002146 (s[2] & 0xc0) != 0x80) {
2147 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002148 startinpos = s-starts;
2149 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002150 goto utf8Error;
2151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002153 if (ch < 0x0800) {
2154 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00002155 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002156
2157 XXX For wide builds (UCS-4) we should probably try
2158 to recombine the surrogates into a single code
2159 unit.
2160 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002161 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002162 startinpos = s-starts;
2163 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002164 goto utf8Error;
2165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002167 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002168 break;
2169
2170 case 4:
2171 if ((s[1] & 0xc0) != 0x80 ||
2172 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002173 (s[3] & 0xc0) != 0x80) {
2174 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002175 startinpos = s-starts;
2176 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002177 goto utf8Error;
2178 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002179 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2180 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2181 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002182 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002183 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002184 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002185 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002186 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002187 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002188 startinpos = s-starts;
2189 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002190 goto utf8Error;
2191 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002192#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002193 *p++ = (Py_UNICODE)ch;
2194#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002195 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002196
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002197 /* translate from 10000..10FFFF to 0..FFFF */
2198 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002199
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002200 /* high surrogate = top 10 bits added to D800 */
2201 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002202
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002203 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002204 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002205#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 break;
2207
2208 default:
2209 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002210 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002211 startinpos = s-starts;
2212 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002213 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 }
2215 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002216 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002217
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002218 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002219 outpos = p-PyUnicode_AS_UNICODE(unicode);
2220 if (unicode_decode_call_errorhandler(
2221 errors, &errorHandler,
2222 "utf8", errmsg,
Walter Dörwalde78178e2007-07-30 13:31:40 +00002223 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002224 &unicode, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002225 goto onError;
Antoine Pitrouab868312009-01-10 15:40:25 +00002226 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227 }
Walter Dörwald69652032004-09-07 20:24:22 +00002228 if (consumed)
2229 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230
2231 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002232 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 goto onError;
2234
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 Py_XDECREF(errorHandler);
2236 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002237 return (PyObject *)unicode;
2238
2239onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002240 Py_XDECREF(errorHandler);
2241 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242 Py_DECREF(unicode);
2243 return NULL;
2244}
2245
Antoine Pitrouab868312009-01-10 15:40:25 +00002246#undef ASCII_CHAR_MASK
2247
2248
Tim Peters602f7402002-04-27 18:03:26 +00002249/* Allocation strategy: if the string is short, convert into a stack buffer
2250 and allocate exactly as much space needed at the end. Else allocate the
2251 maximum possible needed (4 result bytes per Unicode character), and return
2252 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002253*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002254PyObject *
2255PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002256 Py_ssize_t size,
Tim Peters7e3d9612002-04-21 03:26:37 +00002257 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258{
Tim Peters602f7402002-04-27 18:03:26 +00002259#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002260
Guido van Rossum98297ee2007-11-06 21:34:58 +00002261 Py_ssize_t i; /* index into s of next input byte */
2262 PyObject *result; /* result string object */
2263 char *p; /* next free byte in output buffer */
2264 Py_ssize_t nallocated; /* number of result bytes allocated */
2265 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002266 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002267
Tim Peters602f7402002-04-27 18:03:26 +00002268 assert(s != NULL);
2269 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270
Tim Peters602f7402002-04-27 18:03:26 +00002271 if (size <= MAX_SHORT_UNICHARS) {
2272 /* Write into the stack buffer; nallocated can't overflow.
2273 * At the end, we'll allocate exactly as much heap space as it
2274 * turns out we need.
2275 */
2276 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002277 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002278 p = stackbuf;
2279 }
2280 else {
2281 /* Overallocate on the heap, and give the excess back at the end. */
2282 nallocated = size * 4;
2283 if (nallocated / 4 != size) /* overflow! */
2284 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002285 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002286 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002287 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002288 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002289 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002290
Tim Peters602f7402002-04-27 18:03:26 +00002291 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002292 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002293
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002294 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002295 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002297
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002299 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002300 *p++ = (char)(0xc0 | (ch >> 6));
2301 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002302 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002303 else {
Tim Peters602f7402002-04-27 18:03:26 +00002304 /* Encode UCS2 Unicode ordinals */
2305 if (ch < 0x10000) {
2306 /* Special case: check for high surrogate */
2307 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2308 Py_UCS4 ch2 = s[i];
2309 /* Check for low surrogate and combine the two to
2310 form a UCS4 value */
2311 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002312 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002313 i++;
2314 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002315 }
Tim Peters602f7402002-04-27 18:03:26 +00002316 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002317 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002318 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002319 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2320 *p++ = (char)(0x80 | (ch & 0x3f));
2321 continue;
2322 }
2323encodeUCS4:
2324 /* Encode UCS4 Unicode ordinals */
2325 *p++ = (char)(0xf0 | (ch >> 18));
2326 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2327 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2328 *p++ = (char)(0x80 | (ch & 0x3f));
2329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002331
Guido van Rossum98297ee2007-11-06 21:34:58 +00002332 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002333 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002334 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002335 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002336 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002337 }
2338 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002339 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002340 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002341 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002342 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002343 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002344 return result;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002345
Tim Peters602f7402002-04-27 18:03:26 +00002346#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347}
2348
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2350{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 if (!PyUnicode_Check(unicode)) {
2352 PyErr_BadArgument();
2353 return NULL;
2354 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002355 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2356 PyUnicode_GET_SIZE(unicode),
2357 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358}
2359
Walter Dörwald41980ca2007-08-16 21:55:45 +00002360/* --- UTF-32 Codec ------------------------------------------------------- */
2361
2362PyObject *
2363PyUnicode_DecodeUTF32(const char *s,
2364 Py_ssize_t size,
2365 const char *errors,
2366 int *byteorder)
2367{
2368 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2369}
2370
2371PyObject *
2372PyUnicode_DecodeUTF32Stateful(const char *s,
2373 Py_ssize_t size,
2374 const char *errors,
2375 int *byteorder,
2376 Py_ssize_t *consumed)
2377{
2378 const char *starts = s;
2379 Py_ssize_t startinpos;
2380 Py_ssize_t endinpos;
2381 Py_ssize_t outpos;
2382 PyUnicodeObject *unicode;
2383 Py_UNICODE *p;
2384#ifndef Py_UNICODE_WIDE
2385 int i, pairs;
2386#else
2387 const int pairs = 0;
2388#endif
2389 const unsigned char *q, *e;
2390 int bo = 0; /* assume native ordering by default */
2391 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002392 /* Offsets from q for retrieving bytes in the right order. */
2393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2394 int iorder[] = {0, 1, 2, 3};
2395#else
2396 int iorder[] = {3, 2, 1, 0};
2397#endif
2398 PyObject *errorHandler = NULL;
2399 PyObject *exc = NULL;
Guido van Rossum8d991ed2007-08-17 15:41:00 +00002400 /* On narrow builds we split characters outside the BMP into two
2401 codepoints => count how much extra space we need. */
2402#ifndef Py_UNICODE_WIDE
2403 for (i = pairs = 0; i < size/4; i++)
2404 if (((Py_UCS4 *)s)[i] >= 0x10000)
2405 pairs++;
2406#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00002407
2408 /* This might be one to much, because of a BOM */
2409 unicode = _PyUnicode_New((size+3)/4+pairs);
2410 if (!unicode)
2411 return NULL;
2412 if (size == 0)
2413 return (PyObject *)unicode;
2414
2415 /* Unpack UTF-32 encoded data */
2416 p = unicode->str;
2417 q = (unsigned char *)s;
2418 e = q + size;
2419
2420 if (byteorder)
2421 bo = *byteorder;
2422
2423 /* Check for BOM marks (U+FEFF) in the input and adjust current
2424 byte order setting accordingly. In native mode, the leading BOM
2425 mark is skipped, in all other modes, it is copied to the output
2426 stream as-is (giving a ZWNBSP character). */
2427 if (bo == 0) {
2428 if (size >= 4) {
2429 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2430 (q[iorder[1]] << 8) | q[iorder[0]];
2431#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2432 if (bom == 0x0000FEFF) {
2433 q += 4;
2434 bo = -1;
2435 }
2436 else if (bom == 0xFFFE0000) {
2437 q += 4;
2438 bo = 1;
2439 }
2440#else
2441 if (bom == 0x0000FEFF) {
2442 q += 4;
2443 bo = 1;
2444 }
2445 else if (bom == 0xFFFE0000) {
2446 q += 4;
2447 bo = -1;
2448 }
2449#endif
2450 }
2451 }
2452
2453 if (bo == -1) {
2454 /* force LE */
2455 iorder[0] = 0;
2456 iorder[1] = 1;
2457 iorder[2] = 2;
2458 iorder[3] = 3;
2459 }
2460 else if (bo == 1) {
2461 /* force BE */
2462 iorder[0] = 3;
2463 iorder[1] = 2;
2464 iorder[2] = 1;
2465 iorder[3] = 0;
2466 }
2467
2468 while (q < e) {
2469 Py_UCS4 ch;
2470 /* remaining bytes at the end? (size should be divisible by 4) */
2471 if (e-q<4) {
2472 if (consumed)
2473 break;
2474 errmsg = "truncated data";
2475 startinpos = ((const char *)q)-starts;
2476 endinpos = ((const char *)e)-starts;
2477 goto utf32Error;
2478 /* The remaining input chars are ignored if the callback
2479 chooses to skip the input */
2480 }
2481 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2482 (q[iorder[1]] << 8) | q[iorder[0]];
2483
2484 if (ch >= 0x110000)
2485 {
2486 errmsg = "codepoint not in range(0x110000)";
2487 startinpos = ((const char *)q)-starts;
2488 endinpos = startinpos+4;
2489 goto utf32Error;
2490 }
2491#ifndef Py_UNICODE_WIDE
2492 if (ch >= 0x10000)
2493 {
2494 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2495 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2496 }
2497 else
2498#endif
2499 *p++ = ch;
2500 q += 4;
2501 continue;
2502 utf32Error:
2503 outpos = p-PyUnicode_AS_UNICODE(unicode);
2504 if (unicode_decode_call_errorhandler(
2505 errors, &errorHandler,
2506 "utf32", errmsg,
2507 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002508 &unicode, &outpos, &p))
Walter Dörwald41980ca2007-08-16 21:55:45 +00002509 goto onError;
2510 }
2511
2512 if (byteorder)
2513 *byteorder = bo;
2514
2515 if (consumed)
2516 *consumed = (const char *)q-starts;
2517
2518 /* Adjust length */
2519 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2520 goto onError;
2521
2522 Py_XDECREF(errorHandler);
2523 Py_XDECREF(exc);
2524 return (PyObject *)unicode;
2525
2526onError:
2527 Py_DECREF(unicode);
2528 Py_XDECREF(errorHandler);
2529 Py_XDECREF(exc);
2530 return NULL;
2531}
2532
2533PyObject *
2534PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2535 Py_ssize_t size,
2536 const char *errors,
2537 int byteorder)
2538{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002539 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002540 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002541 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002542#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002543 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002544#else
2545 const int pairs = 0;
2546#endif
2547 /* Offsets from p for storing byte pairs in the right order. */
2548#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2549 int iorder[] = {0, 1, 2, 3};
2550#else
2551 int iorder[] = {3, 2, 1, 0};
2552#endif
2553
2554#define STORECHAR(CH) \
2555 do { \
2556 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2557 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2558 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2559 p[iorder[0]] = (CH) & 0xff; \
2560 p += 4; \
2561 } while(0)
2562
2563 /* In narrow builds we can output surrogate pairs as one codepoint,
2564 so we need less space. */
2565#ifndef Py_UNICODE_WIDE
2566 for (i = pairs = 0; i < size-1; i++)
2567 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2568 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2569 pairs++;
2570#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002571 nsize = (size - pairs + (byteorder == 0));
2572 bytesize = nsize * 4;
2573 if (bytesize / 4 != nsize)
2574 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002575 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002576 if (v == NULL)
2577 return NULL;
2578
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002579 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002580 if (byteorder == 0)
2581 STORECHAR(0xFEFF);
2582 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002583 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002584
2585 if (byteorder == -1) {
2586 /* force LE */
2587 iorder[0] = 0;
2588 iorder[1] = 1;
2589 iorder[2] = 2;
2590 iorder[3] = 3;
2591 }
2592 else if (byteorder == 1) {
2593 /* force BE */
2594 iorder[0] = 3;
2595 iorder[1] = 2;
2596 iorder[2] = 1;
2597 iorder[3] = 0;
2598 }
2599
2600 while (size-- > 0) {
2601 Py_UCS4 ch = *s++;
2602#ifndef Py_UNICODE_WIDE
2603 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2604 Py_UCS4 ch2 = *s;
2605 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2606 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2607 s++;
2608 size--;
2609 }
2610 }
2611#endif
2612 STORECHAR(ch);
2613 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002614
2615 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002616 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002617#undef STORECHAR
2618}
2619
2620PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2621{
2622 if (!PyUnicode_Check(unicode)) {
2623 PyErr_BadArgument();
2624 return NULL;
2625 }
2626 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2627 PyUnicode_GET_SIZE(unicode),
2628 NULL,
2629 0);
2630}
2631
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632/* --- UTF-16 Codec ------------------------------------------------------- */
2633
Tim Peters772747b2001-08-09 22:21:55 +00002634PyObject *
2635PyUnicode_DecodeUTF16(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002636 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002637 const char *errors,
2638 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639{
Walter Dörwald69652032004-09-07 20:24:22 +00002640 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2641}
2642
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   Each mask selects the most significant bit of every 16-bit unit packed
   in the long.  Surrogates lie in 0xD800..0xDFFF, so they always have
   that bit set; a zero result of (data & mask) therefore proves that no
   unit is a surrogate, while a non-zero result only means "look closer".
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering (the
   tested bit is then the top bit of the other byte of each unit).
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
2659
Walter Dörwald69652032004-09-07 20:24:22 +00002660PyObject *
2661PyUnicode_DecodeUTF16Stateful(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002662 Py_ssize_t size,
Walter Dörwald69652032004-09-07 20:24:22 +00002663 const char *errors,
2664 int *byteorder,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002665 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002666{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002667 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002668 Py_ssize_t startinpos;
2669 Py_ssize_t endinpos;
2670 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 PyUnicodeObject *unicode;
2672 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00002673 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00002674 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00002675 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002676 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002677 /* Offsets from q for retrieving byte pairs in the right order. */
2678#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2679 int ihi = 1, ilo = 0;
2680#else
2681 int ihi = 0, ilo = 1;
2682#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002683 PyObject *errorHandler = NULL;
2684 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685
2686 /* Note: size will always be longer than the resulting Unicode
2687 character count */
2688 unicode = _PyUnicode_New(size);
2689 if (!unicode)
2690 return NULL;
2691 if (size == 0)
2692 return (PyObject *)unicode;
2693
2694 /* Unpack UTF-16 encoded data */
2695 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002696 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00002697 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698
2699 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002700 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002702 /* Check for BOM marks (U+FEFF) in the input and adjust current
2703 byte order setting accordingly. In native mode, the leading BOM
2704 mark is skipped, in all other modes, it is copied to the output
2705 stream as-is (giving a ZWNBSP character). */
2706 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002707 if (size >= 2) {
2708 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002709#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00002710 if (bom == 0xFEFF) {
2711 q += 2;
2712 bo = -1;
2713 }
2714 else if (bom == 0xFFFE) {
2715 q += 2;
2716 bo = 1;
2717 }
Tim Petersced69f82003-09-16 20:30:58 +00002718#else
Walter Dörwald69652032004-09-07 20:24:22 +00002719 if (bom == 0xFEFF) {
2720 q += 2;
2721 bo = 1;
2722 }
2723 else if (bom == 0xFFFE) {
2724 q += 2;
2725 bo = -1;
2726 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002727#endif
Walter Dörwald69652032004-09-07 20:24:22 +00002728 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730
Tim Peters772747b2001-08-09 22:21:55 +00002731 if (bo == -1) {
2732 /* force LE */
2733 ihi = 1;
2734 ilo = 0;
2735 }
2736 else if (bo == 1) {
2737 /* force BE */
2738 ihi = 0;
2739 ilo = 1;
2740 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002741#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2742 native_ordering = ilo < ihi;
2743#else
2744 native_ordering = ilo > ihi;
2745#endif
Tim Peters772747b2001-08-09 22:21:55 +00002746
Antoine Pitrouab868312009-01-10 15:40:25 +00002747 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00002748 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00002750 /* First check for possible aligned read of a C 'long'. Unaligned
2751 reads are more expensive, better to defer to another iteration. */
2752 if (!((size_t) q & LONG_PTR_MASK)) {
2753 /* Fast path for runs of non-surrogate chars. */
2754 register const unsigned char *_q = q;
2755 Py_UNICODE *_p = p;
2756 if (native_ordering) {
2757 /* Native ordering is simple: as long as the input cannot
2758 possibly contain a surrogate char, do an unrolled copy
2759 of several 16-bit code points to the target object.
2760 The non-surrogate check is done on several input bytes
2761 at a time (as many as a C 'long' can contain). */
2762 while (_q < aligned_end) {
2763 unsigned long data = * (unsigned long *) _q;
2764 if (data & FAST_CHAR_MASK)
2765 break;
2766 _p[0] = ((unsigned short *) _q)[0];
2767 _p[1] = ((unsigned short *) _q)[1];
2768#if (SIZEOF_LONG == 8)
2769 _p[2] = ((unsigned short *) _q)[2];
2770 _p[3] = ((unsigned short *) _q)[3];
2771#endif
2772 _q += SIZEOF_LONG;
2773 _p += SIZEOF_LONG / 2;
2774 }
2775 }
2776 else {
2777 /* Byteswapped ordering is similar, but we must decompose
2778 the copy bytewise, and take care of zero'ing out the
2779 upper bytes if the target object is in 32-bit units
2780 (that is, in UCS-4 builds). */
2781 while (_q < aligned_end) {
2782 unsigned long data = * (unsigned long *) _q;
2783 if (data & SWAPPED_FAST_CHAR_MASK)
2784 break;
2785 /* Zero upper bytes in UCS-4 builds */
2786#if (Py_UNICODE_SIZE > 2)
2787 _p[0] = 0;
2788 _p[1] = 0;
2789#if (SIZEOF_LONG == 8)
2790 _p[2] = 0;
2791 _p[3] = 0;
2792#endif
2793#endif
2794 ((unsigned char *) _p)[1] = _q[0];
2795 ((unsigned char *) _p)[0] = _q[1];
2796 ((unsigned char *) _p)[1 + Py_UNICODE_SIZE] = _q[2];
2797 ((unsigned char *) _p)[0 + Py_UNICODE_SIZE] = _q[3];
2798#if (SIZEOF_LONG == 8)
2799 ((unsigned char *) _p)[1 + 2 * Py_UNICODE_SIZE] = _q[4];
2800 ((unsigned char *) _p)[0 + 2 * Py_UNICODE_SIZE] = _q[5];
2801 ((unsigned char *) _p)[1 + 3 * Py_UNICODE_SIZE] = _q[6];
2802 ((unsigned char *) _p)[0 + 3 * Py_UNICODE_SIZE] = _q[7];
2803#endif
2804 _q += SIZEOF_LONG;
2805 _p += SIZEOF_LONG / 2;
2806 }
2807 }
2808 p = _p;
2809 q = _q;
2810 if (q >= e)
2811 break;
2812 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813 ch = (q[ihi] << 8) | q[ilo];
2814
Tim Peters772747b2001-08-09 22:21:55 +00002815 q += 2;
2816
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817 if (ch < 0xD800 || ch > 0xDFFF) {
2818 *p++ = ch;
2819 continue;
2820 }
2821
2822 /* UTF-16 code pair: */
Antoine Pitrouab868312009-01-10 15:40:25 +00002823 if (q > e) {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002824 errmsg = "unexpected end of data";
Antoine Pitrouab868312009-01-10 15:40:25 +00002825 startinpos = (((const char *)q) - 2) - starts;
2826 endinpos = ((const char *)e) + 1 - starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002827 goto utf16Error;
2828 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002829 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00002830 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2831 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00002832 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002833#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002834 *p++ = ch;
2835 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002836#else
2837 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002838#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002839 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002840 }
2841 else {
2842 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002843 startinpos = (((const char *)q)-4)-starts;
2844 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002845 goto utf16Error;
2846 }
2847
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002849 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002850 startinpos = (((const char *)q)-2)-starts;
2851 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002852 /* Fall through to report the error */
2853
2854 utf16Error:
Antoine Pitrouab868312009-01-10 15:40:25 +00002855 outpos = p - PyUnicode_AS_UNICODE(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002856 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00002857 errors,
2858 &errorHandler,
2859 "utf16", errmsg,
2860 &starts,
2861 (const char **)&e,
2862 &startinpos,
2863 &endinpos,
2864 &exc,
2865 (const char **)&q,
2866 &unicode,
2867 &outpos,
2868 &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 }
Antoine Pitrouab868312009-01-10 15:40:25 +00002871 /* remaining byte at the end? (size should be even) */
2872 if (e == q) {
2873 if (!consumed) {
2874 errmsg = "truncated data";
2875 startinpos = ((const char *)q) - starts;
2876 endinpos = ((const char *)e) + 1 - starts;
2877 outpos = p - PyUnicode_AS_UNICODE(unicode);
2878 if (unicode_decode_call_errorhandler(
2879 errors,
2880 &errorHandler,
2881 "utf16", errmsg,
2882 &starts,
2883 (const char **)&e,
2884 &startinpos,
2885 &endinpos,
2886 &exc,
2887 (const char **)&q,
2888 &unicode,
2889 &outpos,
2890 &p))
2891 goto onError;
2892 /* The remaining input chars are ignored if the callback
2893 chooses to skip the input */
2894 }
2895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896
2897 if (byteorder)
2898 *byteorder = bo;
2899
Walter Dörwald69652032004-09-07 20:24:22 +00002900 if (consumed)
2901 *consumed = (const char *)q-starts;
2902
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002904 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 goto onError;
2906
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 Py_XDECREF(errorHandler);
2908 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 return (PyObject *)unicode;
2910
2911onError:
2912 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002913 Py_XDECREF(errorHandler);
2914 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 return NULL;
2916}
2917
Antoine Pitrouab868312009-01-10 15:40:25 +00002918#undef FAST_CHAR_MASK
2919#undef SWAPPED_FAST_CHAR_MASK
2920
Tim Peters772747b2001-08-09 22:21:55 +00002921PyObject *
2922PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002923 Py_ssize_t size,
Tim Peters772747b2001-08-09 22:21:55 +00002924 const char *errors,
2925 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002927 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002928 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002929 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002930#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002931 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002932#else
2933 const int pairs = 0;
2934#endif
Tim Peters772747b2001-08-09 22:21:55 +00002935 /* Offsets from p for storing byte pairs in the right order. */
2936#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2937 int ihi = 1, ilo = 0;
2938#else
2939 int ihi = 0, ilo = 1;
2940#endif
2941
2942#define STORECHAR(CH) \
2943 do { \
2944 p[ihi] = ((CH) >> 8) & 0xff; \
2945 p[ilo] = (CH) & 0xff; \
2946 p += 2; \
2947 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002949#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002950 for (i = pairs = 0; i < size; i++)
2951 if (s[i] >= 0x10000)
2952 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002953#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002954 /* 2 * (size + pairs + (byteorder == 0)) */
2955 if (size > PY_SSIZE_T_MAX ||
2956 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2957 return PyErr_NoMemory();
2958 nsize = size + pairs + (byteorder == 0);
2959 bytesize = nsize * 2;
2960 if (bytesize / 2 != nsize)
2961 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002962 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 if (v == NULL)
2964 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002966 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00002968 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002969 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002970 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00002971
2972 if (byteorder == -1) {
2973 /* force LE */
2974 ihi = 1;
2975 ilo = 0;
2976 }
2977 else if (byteorder == 1) {
2978 /* force BE */
2979 ihi = 0;
2980 ilo = 1;
2981 }
2982
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002983 while (size-- > 0) {
2984 Py_UNICODE ch = *s++;
2985 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002986#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002987 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00002988 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2989 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002991#endif
Tim Peters772747b2001-08-09 22:21:55 +00002992 STORECHAR(ch);
2993 if (ch2)
2994 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002995 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002996
2997 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002998 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002999#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000}
3001
3002PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3003{
3004 if (!PyUnicode_Check(unicode)) {
3005 PyErr_BadArgument();
3006 return NULL;
3007 }
3008 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3009 PyUnicode_GET_SIZE(unicode),
3010 NULL,
3011 0);
3012}
3013
3014/* --- Unicode Escape Codec ----------------------------------------------- */
3015
/* File-local pointer to a _PyUnicode_Name_CAPI table; NULL until assigned.
   NOTE(review): presumably filled in on first use by the \N{...} escape
   handling in the decoder below -- confirm against the code that sets it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003017
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003019 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 const char *errors)
3021{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003023 Py_ssize_t startinpos;
3024 Py_ssize_t endinpos;
3025 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003028 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003030 char* message;
3031 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003032 PyObject *errorHandler = NULL;
3033 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003034
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 /* Escaped strings will always be longer than the resulting
3036 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003037 length after conversion to the true value.
3038 (but if the error callback returns a long replacement string
3039 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 v = _PyUnicode_New(size);
3041 if (v == NULL)
3042 goto onError;
3043 if (size == 0)
3044 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003045
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003046 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003048
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049 while (s < end) {
3050 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003051 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053
3054 /* Non-escape characters are interpreted as Unicode ordinals */
3055 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003056 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 continue;
3058 }
3059
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 /* \ - Escapes */
3062 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003063 c = *s++;
3064 if (s > end)
3065 c = '\0'; /* Invalid after \ */
3066 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067
3068 /* \x escapes */
3069 case '\n': break;
3070 case '\\': *p++ = '\\'; break;
3071 case '\'': *p++ = '\''; break;
3072 case '\"': *p++ = '\"'; break;
3073 case 'b': *p++ = '\b'; break;
3074 case 'f': *p++ = '\014'; break; /* FF */
3075 case 't': *p++ = '\t'; break;
3076 case 'n': *p++ = '\n'; break;
3077 case 'r': *p++ = '\r'; break;
3078 case 'v': *p++ = '\013'; break; /* VT */
3079 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3080
3081 /* \OOO (octal) escapes */
3082 case '0': case '1': case '2': case '3':
3083 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003084 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003085 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003086 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003087 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003088 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003090 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 break;
3092
Fredrik Lundhccc74732001-02-18 22:13:49 +00003093 /* hex escapes */
3094 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003096 digits = 2;
3097 message = "truncated \\xXX escape";
3098 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099
Fredrik Lundhccc74732001-02-18 22:13:49 +00003100 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003102 digits = 4;
3103 message = "truncated \\uXXXX escape";
3104 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105
Fredrik Lundhccc74732001-02-18 22:13:49 +00003106 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003107 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003108 digits = 8;
3109 message = "truncated \\UXXXXXXXX escape";
3110 hexescape:
3111 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003112 outpos = p-PyUnicode_AS_UNICODE(v);
3113 if (s+digits>end) {
3114 endinpos = size;
3115 if (unicode_decode_call_errorhandler(
3116 errors, &errorHandler,
3117 "unicodeescape", "end of string in escape sequence",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003118 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003119 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003120 goto onError;
3121 goto nextByte;
3122 }
3123 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003124 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003125 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003126 endinpos = (s+i+1)-starts;
3127 if (unicode_decode_call_errorhandler(
3128 errors, &errorHandler,
3129 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003130 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003131 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003132 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003133 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003134 }
3135 chr = (chr<<4) & ~0xF;
3136 if (c >= '0' && c <= '9')
3137 chr += c - '0';
3138 else if (c >= 'a' && c <= 'f')
3139 chr += 10 + c - 'a';
3140 else
3141 chr += 10 + c - 'A';
3142 }
3143 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003144 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003145 /* _decoding_error will have already written into the
3146 target buffer. */
3147 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003148 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003149 /* when we get here, chr is a 32-bit unicode character */
3150 if (chr <= 0xffff)
3151 /* UCS-2 character */
3152 *p++ = (Py_UNICODE) chr;
3153 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003154 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003155 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003156#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003157 *p++ = chr;
3158#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003159 chr -= 0x10000L;
3160 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003161 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003162#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003163 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 endinpos = s-starts;
3165 outpos = p-PyUnicode_AS_UNICODE(v);
3166 if (unicode_decode_call_errorhandler(
3167 errors, &errorHandler,
3168 "unicodeescape", "illegal Unicode character",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003169 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003170 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003171 goto onError;
3172 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003173 break;
3174
3175 /* \N{name} */
3176 case 'N':
3177 message = "malformed \\N character escape";
3178 if (ucnhash_CAPI == NULL) {
3179 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003180 PyObject *m, *api;
Christian Heimes072c0f12008-01-03 23:01:04 +00003181 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003182 if (m == NULL)
3183 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003184 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00003185 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003186 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00003187 goto ucnhashError;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003188 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003189 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003190 if (ucnhash_CAPI == NULL)
3191 goto ucnhashError;
3192 }
3193 if (*s == '{') {
3194 const char *start = s+1;
3195 /* look for the closing brace */
3196 while (*s != '}' && s < end)
3197 s++;
3198 if (s > start && s < end && *s == '}') {
3199 /* found a name. look it up in the unicode database */
3200 message = "unknown Unicode character name";
3201 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003202 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003203 goto store;
3204 }
3205 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003206 endinpos = s-starts;
3207 outpos = p-PyUnicode_AS_UNICODE(v);
3208 if (unicode_decode_call_errorhandler(
3209 errors, &errorHandler,
3210 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003211 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003212 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003213 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003214 break;
3215
3216 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003217 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 message = "\\ at end of string";
3219 s--;
3220 endinpos = s-starts;
3221 outpos = p-PyUnicode_AS_UNICODE(v);
3222 if (unicode_decode_call_errorhandler(
3223 errors, &errorHandler,
3224 "unicodeescape", message,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003225 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003226 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003227 goto onError;
3228 }
3229 else {
3230 *p++ = '\\';
3231 *p++ = (unsigned char)s[-1];
3232 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003233 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003235 nextByte:
3236 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003238 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003240 Py_XDECREF(errorHandler);
3241 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003243
Fredrik Lundhccc74732001-02-18 22:13:49 +00003244ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003245 PyErr_SetString(
3246 PyExc_UnicodeError,
3247 "\\N escapes not supported (can't load unicodedata module)"
3248 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003249 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003250 Py_XDECREF(errorHandler);
3251 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003252 return NULL;
3253
Fredrik Lundhccc74732001-02-18 22:13:49 +00003254onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 Py_XDECREF(errorHandler);
3257 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 return NULL;
3259}
3260
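/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: earlier revisions of this comment described a "quotes" option
   that enclosed the result in u"" or u'' quotes.  The encoder below,
   PyUnicode_EncodeUnicodeEscape(), takes only a buffer and its size and
   never adds enclosing quotes.

*/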
3267
Thomas Wouters477c8d52006-05-27 19:21:47 +00003268Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3269 Py_ssize_t size,
3270 Py_UNICODE ch)
3271{
3272 /* like wcschr, but doesn't stop at NULL characters */
3273
3274 while (size-- > 0) {
3275 if (*s == ch)
3276 return s;
3277 s++;
3278 }
3279
3280 return NULL;
3281}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003282
/* Lower-case hexadecimal digits, indexed by nibble value (0..15).  Used by
   the escape encoders in this file to format \x, \u and \U escapes. */
static const char *hexdigits = "0123456789abcdef";
3284
3285PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3286 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003288 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003291#ifdef Py_UNICODE_WIDE
3292 const Py_ssize_t expandsize = 10;
3293#else
3294 const Py_ssize_t expandsize = 6;
3295#endif
3296
Thomas Wouters89f507f2006-12-13 04:49:30 +00003297 /* XXX(nnorwitz): rather than over-allocating, it would be
3298 better to choose a different scheme. Perhaps scan the
3299 first N-chars of the string and allocate based on that size.
3300 */
3301 /* Initial allocation is based on the longest-possible unichr
3302 escape.
3303
3304 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3305 unichr, so in this case it's the longest unichr escape. In
3306 narrow (UTF-16) builds this is five chars per source unichr
3307 since there are two unichrs in the surrogate pair, so in narrow
3308 (UTF-16) builds it's not the longest unichr escape.
3309
3310 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3311 so in the narrow (UTF-16) build case it's the longest unichr
3312 escape.
3313 */
3314
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003315 if (size == 0)
3316 return PyBytes_FromStringAndSize(NULL, 0);
3317
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003318 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3319 return PyErr_NoMemory();
3320
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003321 repr = PyBytes_FromStringAndSize(NULL,
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003322 2
3323 + expandsize*size
Thomas Wouters89f507f2006-12-13 04:49:30 +00003324 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325 if (repr == NULL)
3326 return NULL;
3327
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003328 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 while (size-- > 0) {
3331 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003332
Walter Dörwald79e913e2007-05-12 11:08:06 +00003333 /* Escape backslashes */
3334 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 *p++ = '\\';
3336 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003337 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003338 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003339
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003340#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003341 /* Map 21-bit characters to '\U00xxxxxx' */
3342 else if (ch >= 0x10000) {
3343 *p++ = '\\';
3344 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003345 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3346 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3347 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3348 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3349 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3350 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3351 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3352 *p++ = hexdigits[ch & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003353 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003354 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003355#else
3356 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003357 else if (ch >= 0xD800 && ch < 0xDC00) {
3358 Py_UNICODE ch2;
3359 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003360
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003361 ch2 = *s++;
3362 size--;
3363 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3364 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3365 *p++ = '\\';
3366 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003367 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3368 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3369 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3370 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3371 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3372 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3373 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3374 *p++ = hexdigits[ucs & 0x0000000F];
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003375 continue;
3376 }
3377 /* Fall through: isolated surrogates are copied as-is */
3378 s--;
3379 size++;
3380 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003381#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003382
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003384 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 *p++ = '\\';
3386 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003387 *p++ = hexdigits[(ch >> 12) & 0x000F];
3388 *p++ = hexdigits[(ch >> 8) & 0x000F];
3389 *p++ = hexdigits[(ch >> 4) & 0x000F];
3390 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003392
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003393 /* Map special whitespace to '\t', \n', '\r' */
3394 else if (ch == '\t') {
3395 *p++ = '\\';
3396 *p++ = 't';
3397 }
3398 else if (ch == '\n') {
3399 *p++ = '\\';
3400 *p++ = 'n';
3401 }
3402 else if (ch == '\r') {
3403 *p++ = '\\';
3404 *p++ = 'r';
3405 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003406
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003407 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003408 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003410 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003411 *p++ = hexdigits[(ch >> 4) & 0x000F];
3412 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003413 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003414
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415 /* Copy everything else as-is */
3416 else
3417 *p++ = (char) ch;
3418 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003420 assert(p - PyBytes_AS_STRING(repr) > 0);
3421 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3422 return NULL;
3423 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424}
3425
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003426PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003428 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 if (!PyUnicode_Check(unicode)) {
3430 PyErr_BadArgument();
3431 return NULL;
3432 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003433 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3434 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003435 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436}
3437
3438/* --- Raw Unicode Escape Codec ------------------------------------------- */
3439
3440PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003441 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442 const char *errors)
3443{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003445 Py_ssize_t startinpos;
3446 Py_ssize_t endinpos;
3447 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003449 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 const char *end;
3451 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 PyObject *errorHandler = NULL;
3453 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003454
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 /* Escaped strings will always be longer than the resulting
3456 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 length after conversion to the true value. (But decoding error
3458 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 v = _PyUnicode_New(size);
3460 if (v == NULL)
3461 goto onError;
3462 if (size == 0)
3463 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 end = s + size;
3466 while (s < end) {
3467 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00003468 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003470 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471
3472 /* Non-escape characters are interpreted as Unicode ordinals */
3473 if (*s != '\\') {
3474 *p++ = (unsigned char)*s++;
3475 continue;
3476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478
3479 /* \u-escapes are only interpreted iff the number of leading
3480 backslashes if odd */
3481 bs = s;
3482 for (;s < end;) {
3483 if (*s != '\\')
3484 break;
3485 *p++ = (unsigned char)*s++;
3486 }
3487 if (((s - bs) & 1) == 0 ||
3488 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003489 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490 continue;
3491 }
3492 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003493 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 s++;
3495
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003496 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003498 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003499 c = (unsigned char)*s;
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003500 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 endinpos = s-starts;
3502 if (unicode_decode_call_errorhandler(
3503 errors, &errorHandler,
3504 "rawunicodeescape", "truncated \\uXXXX",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003505 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003506 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 }
3510 x = (x<<4) & ~0xF;
3511 if (c >= '0' && c <= '9')
3512 x += c - '0';
3513 else if (c >= 'a' && c <= 'f')
3514 x += 10 + c - 'a';
3515 else
3516 x += 10 + c - 'A';
3517 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003518 if (x <= 0xffff)
3519 /* UCS-2 character */
3520 *p++ = (Py_UNICODE) x;
3521 else if (x <= 0x10ffff) {
3522 /* UCS-4 character. Either store directly, or as
3523 surrogate pair. */
3524#ifdef Py_UNICODE_WIDE
Christian Heimescc47b052008-03-25 14:56:36 +00003525 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003526#else
3527 x -= 0x10000L;
3528 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3529 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3530#endif
3531 } else {
3532 endinpos = s-starts;
3533 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003534 if (unicode_decode_call_errorhandler(
3535 errors, &errorHandler,
3536 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Walter Dörwalde78178e2007-07-30 13:31:40 +00003537 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003538 &v, &outpos, &p))
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003539 goto onError;
3540 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 nextByte:
3542 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003544 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003545 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 Py_XDECREF(errorHandler);
3547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003549
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550 onError:
3551 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003552 Py_XDECREF(errorHandler);
3553 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 return NULL;
3555}
3556
3557PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003558 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003560 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 char *p;
3562 char *q;
3563
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003564#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003565 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003566#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003567 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003568#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003569
3570 if (size > PY_SSIZE_T_MAX / expandsize)
3571 return PyErr_NoMemory();
3572
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003573 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 if (repr == NULL)
3575 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003576 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003577 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003579 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 while (size-- > 0) {
3581 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003582#ifdef Py_UNICODE_WIDE
3583 /* Map 32-bit characters to '\Uxxxxxxxx' */
3584 if (ch >= 0x10000) {
3585 *p++ = '\\';
3586 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003587 *p++ = hexdigits[(ch >> 28) & 0xf];
3588 *p++ = hexdigits[(ch >> 24) & 0xf];
3589 *p++ = hexdigits[(ch >> 20) & 0xf];
3590 *p++ = hexdigits[(ch >> 16) & 0xf];
3591 *p++ = hexdigits[(ch >> 12) & 0xf];
3592 *p++ = hexdigits[(ch >> 8) & 0xf];
3593 *p++ = hexdigits[(ch >> 4) & 0xf];
3594 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003595 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003596 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003597#else
3598 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3599 if (ch >= 0xD800 && ch < 0xDC00) {
3600 Py_UNICODE ch2;
3601 Py_UCS4 ucs;
3602
3603 ch2 = *s++;
3604 size--;
3605 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3606 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3607 *p++ = '\\';
3608 *p++ = 'U';
3609 *p++ = hexdigits[(ucs >> 28) & 0xf];
3610 *p++ = hexdigits[(ucs >> 24) & 0xf];
3611 *p++ = hexdigits[(ucs >> 20) & 0xf];
3612 *p++ = hexdigits[(ucs >> 16) & 0xf];
3613 *p++ = hexdigits[(ucs >> 12) & 0xf];
3614 *p++ = hexdigits[(ucs >> 8) & 0xf];
3615 *p++ = hexdigits[(ucs >> 4) & 0xf];
3616 *p++ = hexdigits[ucs & 0xf];
3617 continue;
3618 }
3619 /* Fall through: isolated surrogates are copied as-is */
3620 s--;
3621 size++;
3622 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003623#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 /* Map 16-bit characters to '\uxxxx' */
3625 if (ch >= 256) {
3626 *p++ = '\\';
3627 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003628 *p++ = hexdigits[(ch >> 12) & 0xf];
3629 *p++ = hexdigits[(ch >> 8) & 0xf];
3630 *p++ = hexdigits[(ch >> 4) & 0xf];
3631 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632 }
3633 /* Copy everything else as-is */
3634 else
3635 *p++ = (char) ch;
3636 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003637 size = p - q;
3638
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003639 assert(size > 0);
3640 if (_PyBytes_Resize(&repr, size) < 0)
3641 return NULL;
3642 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643}
3644
3645PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3646{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003647 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00003649 PyErr_BadArgument();
3650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 }
Walter Dörwald711005d2007-05-12 12:03:26 +00003652 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3653 PyUnicode_GET_SIZE(unicode));
3654
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003655 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656}
3657
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003658/* --- Unicode Internal Codec ------------------------------------------- */
3659
3660PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003661 Py_ssize_t size,
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003662 const char *errors)
3663{
3664 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003665 Py_ssize_t startinpos;
3666 Py_ssize_t endinpos;
3667 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003668 PyUnicodeObject *v;
3669 Py_UNICODE *p;
3670 const char *end;
3671 const char *reason;
3672 PyObject *errorHandler = NULL;
3673 PyObject *exc = NULL;
3674
Neal Norwitzd43069c2006-01-08 01:12:10 +00003675#ifdef Py_UNICODE_WIDE
3676 Py_UNICODE unimax = PyUnicode_GetMax();
3677#endif
3678
Thomas Wouters89f507f2006-12-13 04:49:30 +00003679 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003680 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3681 if (v == NULL)
3682 goto onError;
3683 if (PyUnicode_GetSize((PyObject *)v) == 0)
3684 return (PyObject *)v;
3685 p = PyUnicode_AS_UNICODE(v);
3686 end = s + size;
3687
3688 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00003689 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003690 /* We have to sanity check the raw data, otherwise doom looms for
3691 some malformed UCS-4 data. */
3692 if (
3693 #ifdef Py_UNICODE_WIDE
3694 *p > unimax || *p < 0 ||
3695 #endif
3696 end-s < Py_UNICODE_SIZE
3697 )
3698 {
3699 startinpos = s - starts;
3700 if (end-s < Py_UNICODE_SIZE) {
3701 endinpos = end-starts;
3702 reason = "truncated input";
3703 }
3704 else {
3705 endinpos = s - starts + Py_UNICODE_SIZE;
3706 reason = "illegal code point (> 0x10FFFF)";
3707 }
3708 outpos = p - PyUnicode_AS_UNICODE(v);
3709 if (unicode_decode_call_errorhandler(
3710 errors, &errorHandler,
3711 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00003712 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00003713 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003714 goto onError;
3715 }
3716 }
3717 else {
3718 p++;
3719 s += Py_UNICODE_SIZE;
3720 }
3721 }
3722
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003723 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003724 goto onError;
3725 Py_XDECREF(errorHandler);
3726 Py_XDECREF(exc);
3727 return (PyObject *)v;
3728
3729 onError:
3730 Py_XDECREF(v);
3731 Py_XDECREF(errorHandler);
3732 Py_XDECREF(exc);
3733 return NULL;
3734}
3735
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736/* --- Latin-1 Codec ------------------------------------------------------ */
3737
3738PyObject *PyUnicode_DecodeLatin1(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003739 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740 const char *errors)
3741{
3742 PyUnicodeObject *v;
3743 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003744 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00003745
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003747 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003748 Py_UNICODE r = *(unsigned char*)s;
3749 return PyUnicode_FromUnicode(&r, 1);
3750 }
3751
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 v = _PyUnicode_New(size);
3753 if (v == NULL)
3754 goto onError;
3755 if (size == 0)
3756 return (PyObject *)v;
3757 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00003758 e = s + size;
3759 /* Unrolling the copy makes it much faster by reducing the looping
3760 overhead. This is similar to what many memcpy() implementations do. */
3761 unrolled_end = e - 4;
3762 while (s < unrolled_end) {
3763 p[0] = (unsigned char) s[0];
3764 p[1] = (unsigned char) s[1];
3765 p[2] = (unsigned char) s[2];
3766 p[3] = (unsigned char) s[3];
3767 s += 4;
3768 p += 4;
3769 }
3770 while (s < e)
3771 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003773
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 onError:
3775 Py_XDECREF(v);
3776 return NULL;
3777}
3778
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779/* create or adjust a UnicodeEncodeError */
3780static void make_encode_exception(PyObject **exceptionObject,
3781 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003782 const Py_UNICODE *unicode, Py_ssize_t size,
3783 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 if (*exceptionObject == NULL) {
3787 *exceptionObject = PyUnicodeEncodeError_Create(
3788 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 }
3790 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3792 goto onError;
3793 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3794 goto onError;
3795 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3796 goto onError;
3797 return;
3798 onError:
3799 Py_DECREF(*exceptionObject);
3800 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 }
3802}
3803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804/* raises a UnicodeEncodeError */
3805static void raise_encode_exception(PyObject **exceptionObject,
3806 const char *encoding,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003807 const Py_UNICODE *unicode, Py_ssize_t size,
3808 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 const char *reason)
3810{
3811 make_encode_exception(exceptionObject,
3812 encoding, unicode, size, startpos, endpos, reason);
3813 if (*exceptionObject != NULL)
3814 PyCodec_StrictErrors(*exceptionObject);
3815}
3816
3817/* error handling callback helper:
3818 build arguments, call the callback and check the arguments,
3819 put the result into newpos and return the replacement string, which
3820 has to be freed by the caller */
3821static PyObject *unicode_encode_call_errorhandler(const char *errors,
3822 PyObject **errorHandler,
3823 const char *encoding, const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003824 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3825 Py_ssize_t startpos, Py_ssize_t endpos,
3826 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003827{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003828 static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829
3830 PyObject *restuple;
3831 PyObject *resunicode;
3832
3833 if (*errorHandler == NULL) {
3834 *errorHandler = PyCodec_LookupError(errors);
3835 if (*errorHandler == NULL)
3836 return NULL;
3837 }
3838
3839 make_encode_exception(exceptionObject,
3840 encoding, unicode, size, startpos, endpos, reason);
3841 if (*exceptionObject == NULL)
3842 return NULL;
3843
3844 restuple = PyObject_CallFunctionObjArgs(
3845 *errorHandler, *exceptionObject, NULL);
3846 if (restuple == NULL)
3847 return NULL;
3848 if (!PyTuple_Check(restuple)) {
3849 PyErr_Format(PyExc_TypeError, &argparse[4]);
3850 Py_DECREF(restuple);
3851 return NULL;
3852 }
3853 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3854 &resunicode, newpos)) {
3855 Py_DECREF(restuple);
3856 return NULL;
3857 }
3858 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003859 *newpos = size+*newpos;
3860 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00003861 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003862 Py_DECREF(restuple);
3863 return NULL;
3864 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865 Py_INCREF(resunicode);
3866 Py_DECREF(restuple);
3867 return resunicode;
3868}
3869
3870static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00003871 Py_ssize_t size,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872 const char *errors,
3873 int limit)
3874{
3875 /* output object */
3876 PyObject *res;
3877 /* pointers to the beginning and end+1 of input */
3878 const Py_UNICODE *startp = p;
3879 const Py_UNICODE *endp = p + size;
3880 /* pointer to the beginning of the unencodable characters */
3881 /* const Py_UNICODE *badp = NULL; */
3882 /* pointer into the output */
3883 char *str;
3884 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003885 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003886 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3887 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 PyObject *errorHandler = NULL;
3889 PyObject *exc = NULL;
3890 /* the following variable is used for caching string comparisons
3891 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3892 int known_errorHandler = -1;
3893
3894 /* allocate enough for a simple encoding without
3895 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00003896 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00003897 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003898 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003899 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003900 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003901 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003902 ressize = size;
3903
3904 while (p<endp) {
3905 Py_UNICODE c = *p;
3906
3907 /* can we encode this? */
3908 if (c<limit) {
3909 /* no overflow check, because we know that the space is enough */
3910 *str++ = (char)c;
3911 ++p;
3912 }
3913 else {
Martin v. Löwis18e16552006-02-15 17:27:45 +00003914 Py_ssize_t unicodepos = p-startp;
3915 Py_ssize_t requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003917 Py_ssize_t repsize;
3918 Py_ssize_t newpos;
3919 Py_ssize_t respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920 Py_UNICODE *uni2;
3921 /* startpos for collecting unencodable chars */
3922 const Py_UNICODE *collstart = p;
3923 const Py_UNICODE *collend = p;
3924 /* find all unecodable characters */
3925 while ((collend < endp) && ((*collend)>=limit))
3926 ++collend;
3927 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3928 if (known_errorHandler==-1) {
3929 if ((errors==NULL) || (!strcmp(errors, "strict")))
3930 known_errorHandler = 1;
3931 else if (!strcmp(errors, "replace"))
3932 known_errorHandler = 2;
3933 else if (!strcmp(errors, "ignore"))
3934 known_errorHandler = 3;
3935 else if (!strcmp(errors, "xmlcharrefreplace"))
3936 known_errorHandler = 4;
3937 else
3938 known_errorHandler = 0;
3939 }
3940 switch (known_errorHandler) {
3941 case 1: /* strict */
3942 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3943 goto onError;
3944 case 2: /* replace */
3945 while (collstart++<collend)
3946 *str++ = '?'; /* fall through */
3947 case 3: /* ignore */
3948 p = collend;
3949 break;
3950 case 4: /* xmlcharrefreplace */
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003951 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 /* determine replacement size (temporarily (mis)uses p) */
3953 for (p = collstart, repsize = 0; p < collend; ++p) {
3954 if (*p<10)
3955 repsize += 2+1+1;
3956 else if (*p<100)
3957 repsize += 2+2+1;
3958 else if (*p<1000)
3959 repsize += 2+3+1;
3960 else if (*p<10000)
3961 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003962#ifndef Py_UNICODE_WIDE
3963 else
3964 repsize += 2+5+1;
3965#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 else if (*p<100000)
3967 repsize += 2+5+1;
3968 else if (*p<1000000)
3969 repsize += 2+6+1;
3970 else
3971 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003972#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 }
3974 requiredsize = respos+repsize+(endp-collend);
3975 if (requiredsize > ressize) {
3976 if (requiredsize<2*ressize)
3977 requiredsize = 2*ressize;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003978 if (_PyBytes_Resize(&res, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 goto onError;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003980 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 ressize = requiredsize;
3982 }
3983 /* generate replacement (temporarily (mis)uses p) */
3984 for (p = collstart; p < collend; ++p) {
3985 str += sprintf(str, "&#%d;", (int)*p);
3986 }
3987 p = collend;
3988 break;
3989 default:
3990 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3991 encoding, reason, startp, size, &exc,
3992 collstart-startp, collend-startp, &newpos);
3993 if (repunicode == NULL)
3994 goto onError;
3995 /* need more space? (at least enough for what we
3996 have+the replacement+the rest of the string, so
3997 we won't have to check space for encodable characters) */
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003998 respos = str - PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003999 repsize = PyUnicode_GET_SIZE(repunicode);
4000 requiredsize = respos+repsize+(endp-collend);
4001 if (requiredsize > ressize) {
4002 if (requiredsize<2*ressize)
4003 requiredsize = 2*ressize;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004004 if (_PyBytes_Resize(&res, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 Py_DECREF(repunicode);
4006 goto onError;
4007 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004008 str = PyBytes_AS_STRING(res) + respos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004009 ressize = requiredsize;
4010 }
4011 /* check if there is anything unencodable in the replacement
4012 and copy it to the output */
4013 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4014 c = *uni2;
4015 if (c >= limit) {
4016 raise_encode_exception(&exc, encoding, startp, size,
4017 unicodepos, unicodepos+1, reason);
4018 Py_DECREF(repunicode);
4019 goto onError;
4020 }
4021 *str = (char)c;
4022 }
4023 p = startp + newpos;
4024 Py_DECREF(repunicode);
4025 }
4026 }
4027 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004028 /* Resize if we allocated to much */
4029 size = str - PyBytes_AS_STRING(res);
4030 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004031 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004032 if (_PyBytes_Resize(&res, size) < 0)
4033 goto onError;
4034 }
4035
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 Py_XDECREF(errorHandler);
4037 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004038 return res;
4039
4040 onError:
4041 Py_XDECREF(res);
4042 Py_XDECREF(errorHandler);
4043 Py_XDECREF(exc);
4044 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045}
4046
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004048 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049 const char *errors)
4050{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052}
4053
4054PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4055{
4056 if (!PyUnicode_Check(unicode)) {
4057 PyErr_BadArgument();
4058 return NULL;
4059 }
4060 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4061 PyUnicode_GET_SIZE(unicode),
4062 NULL);
4063}
4064
4065/* --- 7-bit ASCII Codec -------------------------------------------------- */
4066
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067PyObject *PyUnicode_DecodeASCII(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004068 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 const char *errors)
4070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 PyUnicodeObject *v;
4073 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004074 Py_ssize_t startinpos;
4075 Py_ssize_t endinpos;
4076 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 const char *e;
4078 PyObject *errorHandler = NULL;
4079 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004080
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004082 if (size == 1 && *(unsigned char*)s < 128) {
4083 Py_UNICODE r = *(unsigned char*)s;
4084 return PyUnicode_FromUnicode(&r, 1);
4085 }
Tim Petersced69f82003-09-16 20:30:58 +00004086
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 v = _PyUnicode_New(size);
4088 if (v == NULL)
4089 goto onError;
4090 if (size == 0)
4091 return (PyObject *)v;
4092 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093 e = s + size;
4094 while (s < e) {
4095 register unsigned char c = (unsigned char)*s;
4096 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004098 ++s;
4099 }
4100 else {
4101 startinpos = s-starts;
4102 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00004103 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 if (unicode_decode_call_errorhandler(
4105 errors, &errorHandler,
4106 "ascii", "ordinal not in range(128)",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004107 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004108 &v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004112 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004113 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004114 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 Py_XDECREF(errorHandler);
4116 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004118
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 onError:
4120 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 Py_XDECREF(errorHandler);
4122 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 return NULL;
4124}
4125
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004127 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 const char *errors)
4129{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131}
4132
4133PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4134{
4135 if (!PyUnicode_Check(unicode)) {
4136 PyErr_BadArgument();
4137 return NULL;
4138 }
4139 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4140 PyUnicode_GET_SIZE(unicode),
4141 NULL);
4142}
4143
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004144#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004145
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004146/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004147
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004148#if SIZEOF_INT < SIZEOF_SSIZE_T
4149#define NEED_RETRY
4150#endif
4151
4152/* XXX This code is limited to "true" double-byte encodings, as
4153 a) it assumes an incomplete character consists of a single byte, and
4154 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4155 encodings, see IsDBCSLeadByteEx documentation. */
4156
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte, 0
   otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted as a lead byte if
   the preceding character (as located by CharPrev) does not contradict
   that: there is no preceding character, or the preceding character does
   not start with a lead byte, or it is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4167
4168/*
4169 * Decode MBCS string into unicode object. If 'final' is set, converts
4170 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4171 */
4172static int decode_mbcs(PyUnicodeObject **v,
4173 const char *s, /* MBCS string */
4174 int size, /* sizeof MBCS string */
4175 int final)
4176{
4177 Py_UNICODE *p;
4178 Py_ssize_t n = 0;
4179 int usize = 0;
4180
4181 assert(size >= 0);
4182
4183 /* Skip trailing lead-byte unless 'final' is set */
4184 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4185 --size;
4186
4187 /* First get the size of the result */
4188 if (size > 0) {
4189 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4190 if (usize == 0) {
4191 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4192 return -1;
4193 }
4194 }
4195
4196 if (*v == NULL) {
4197 /* Create unicode object */
4198 *v = _PyUnicode_New(usize);
4199 if (*v == NULL)
4200 return -1;
4201 }
4202 else {
4203 /* Extend unicode object */
4204 n = PyUnicode_GET_SIZE(*v);
4205 if (_PyUnicode_Resize(v, n + usize) < 0)
4206 return -1;
4207 }
4208
4209 /* Do the conversion */
4210 if (size > 0) {
4211 p = PyUnicode_AS_UNICODE(*v) + n;
4212 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4213 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4214 return -1;
4215 }
4216 }
4217
4218 return size;
4219}
4220
4221PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4222 Py_ssize_t size,
4223 const char *errors,
4224 Py_ssize_t *consumed)
4225{
4226 PyUnicodeObject *v = NULL;
4227 int done;
4228
4229 if (consumed)
4230 *consumed = 0;
4231
4232#ifdef NEED_RETRY
4233 retry:
4234 if (size > INT_MAX)
4235 done = decode_mbcs(&v, s, INT_MAX, 0);
4236 else
4237#endif
4238 done = decode_mbcs(&v, s, (int)size, !consumed);
4239
4240 if (done < 0) {
4241 Py_XDECREF(v);
4242 return NULL;
4243 }
4244
4245 if (consumed)
4246 *consumed += done;
4247
4248#ifdef NEED_RETRY
4249 if (size > INT_MAX) {
4250 s += done;
4251 size -= done;
4252 goto retry;
4253 }
4254#endif
4255
4256 return (PyObject *)v;
4257}
4258
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004259PyObject *PyUnicode_DecodeMBCS(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004260 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004261 const char *errors)
4262{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004263 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4264}
4265
4266/*
4267 * Convert unicode into string object (MBCS).
4268 * Returns 0 if succeed, -1 otherwise.
4269 */
4270static int encode_mbcs(PyObject **repr,
4271 const Py_UNICODE *p, /* unicode */
4272 int size) /* size of unicode */
4273{
4274 int mbcssize = 0;
4275 Py_ssize_t n = 0;
4276
4277 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004278
4279 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004280 if (size > 0) {
4281 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4282 if (mbcssize == 0) {
4283 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4284 return -1;
4285 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004286 }
4287
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004288 if (*repr == NULL) {
4289 /* Create string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004290 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004291 if (*repr == NULL)
4292 return -1;
4293 }
4294 else {
4295 /* Extend string object */
Christian Heimes72b710a2008-05-26 13:28:38 +00004296 n = PyBytes_Size(*repr);
Hirokazu Yamamotod88e8fa2008-12-27 14:58:17 +00004297 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004298 return -1;
4299 }
4300
4301 /* Do the conversion */
4302 if (size > 0) {
Christian Heimes72b710a2008-05-26 13:28:38 +00004303 char *s = PyBytes_AS_STRING(*repr) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004304 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4305 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4306 return -1;
4307 }
4308 }
4309
4310 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004311}
4312
4313PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004314 Py_ssize_t size,
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004315 const char *errors)
4316{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004317 PyObject *repr = NULL;
4318 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004319
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004320#ifdef NEED_RETRY
4321 retry:
4322 if (size > INT_MAX)
4323 ret = encode_mbcs(&repr, p, INT_MAX);
4324 else
4325#endif
4326 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004327
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004328 if (ret < 0) {
4329 Py_XDECREF(repr);
4330 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004331 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004332
4333#ifdef NEED_RETRY
4334 if (size > INT_MAX) {
4335 p += INT_MAX;
4336 size -= INT_MAX;
4337 goto retry;
4338 }
4339#endif
4340
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004341 return repr;
4342}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004343
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004344PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4345{
4346 if (!PyUnicode_Check(unicode)) {
4347 PyErr_BadArgument();
4348 return NULL;
4349 }
4350 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4351 PyUnicode_GET_SIZE(unicode),
4352 NULL);
4353}
4354
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004355#undef NEED_RETRY
4356
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004357#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004358
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359/* --- Character Mapping Codec -------------------------------------------- */
4360
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361PyObject *PyUnicode_DecodeCharmap(const char *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004362 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 PyObject *mapping,
4364 const char *errors)
4365{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004367 Py_ssize_t startinpos;
4368 Py_ssize_t endinpos;
4369 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 PyUnicodeObject *v;
4372 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004373 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374 PyObject *errorHandler = NULL;
4375 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004376 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004377 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004378
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 /* Default to Latin-1 */
4380 if (mapping == NULL)
4381 return PyUnicode_DecodeLatin1(s, size, errors);
4382
4383 v = _PyUnicode_New(size);
4384 if (v == NULL)
4385 goto onError;
4386 if (size == 0)
4387 return (PyObject *)v;
4388 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004390 if (PyUnicode_CheckExact(mapping)) {
4391 mapstring = PyUnicode_AS_UNICODE(mapping);
4392 maplen = PyUnicode_GET_SIZE(mapping);
4393 while (s < e) {
4394 unsigned char ch = *s;
4395 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004397 if (ch < maplen)
4398 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004400 if (x == 0xfffe) {
4401 /* undefined mapping */
4402 outpos = p-PyUnicode_AS_UNICODE(v);
4403 startinpos = s-starts;
4404 endinpos = startinpos+1;
4405 if (unicode_decode_call_errorhandler(
4406 errors, &errorHandler,
4407 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004409 &v, &outpos, &p)) {
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004410 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004411 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004412 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00004413 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004414 *p++ = x;
4415 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004417 }
4418 else {
4419 while (s < e) {
4420 unsigned char ch = *s;
4421 PyObject *w, *x;
4422
4423 /* Get mapping (char ordinal -> integer, Unicode char or None) */
Christian Heimes217cfd12007-12-02 14:31:20 +00004424 w = PyLong_FromLong((long)ch);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004425 if (w == NULL)
4426 goto onError;
4427 x = PyObject_GetItem(mapping, w);
4428 Py_DECREF(w);
4429 if (x == NULL) {
4430 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4431 /* No mapping found means: mapping is undefined. */
4432 PyErr_Clear();
4433 x = Py_None;
4434 Py_INCREF(x);
4435 } else
4436 goto onError;
4437 }
4438
4439 /* Apply mapping */
Christian Heimes217cfd12007-12-02 14:31:20 +00004440 if (PyLong_Check(x)) {
4441 long value = PyLong_AS_LONG(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004442 if (value < 0 || value > 65535) {
4443 PyErr_SetString(PyExc_TypeError,
4444 "character mapping must be in range(65536)");
4445 Py_DECREF(x);
4446 goto onError;
4447 }
4448 *p++ = (Py_UNICODE)value;
4449 }
4450 else if (x == Py_None) {
4451 /* undefined mapping */
4452 outpos = p-PyUnicode_AS_UNICODE(v);
4453 startinpos = s-starts;
4454 endinpos = startinpos+1;
4455 if (unicode_decode_call_errorhandler(
4456 errors, &errorHandler,
4457 "charmap", "character maps to <undefined>",
Walter Dörwalde78178e2007-07-30 13:31:40 +00004458 &starts, &e, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004459 &v, &outpos, &p)) {
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004460 Py_DECREF(x);
4461 goto onError;
4462 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00004463 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004464 continue;
4465 }
4466 else if (PyUnicode_Check(x)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004467 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004468
4469 if (targetsize == 1)
4470 /* 1-1 mapping */
4471 *p++ = *PyUnicode_AS_UNICODE(x);
4472
4473 else if (targetsize > 1) {
4474 /* 1-n mapping */
4475 if (targetsize > extrachars) {
4476 /* resize first */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004477 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4478 Py_ssize_t needed = (targetsize - extrachars) + \
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004479 (targetsize << 2);
4480 extrachars += needed;
Thomas Wouters89f507f2006-12-13 04:49:30 +00004481 /* XXX overflow detection missing */
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004482 if (_PyUnicode_Resize(&v,
4483 PyUnicode_GET_SIZE(v) + needed) < 0) {
4484 Py_DECREF(x);
4485 goto onError;
4486 }
4487 p = PyUnicode_AS_UNICODE(v) + oldpos;
4488 }
4489 Py_UNICODE_COPY(p,
4490 PyUnicode_AS_UNICODE(x),
4491 targetsize);
4492 p += targetsize;
4493 extrachars -= targetsize;
4494 }
4495 /* 1-0 mapping: skip the character */
4496 }
4497 else {
4498 /* wrong return value */
4499 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00004500 "character mapping must return integer, None or str");
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004501 Py_DECREF(x);
4502 goto onError;
4503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004505 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 }
4508 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004509 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 Py_XDECREF(errorHandler);
4512 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004514
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 Py_XDECREF(errorHandler);
4517 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 Py_XDECREF(v);
4519 return NULL;
4520}
4521
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004522/* Charmap encoding: the lookup table */
4523
4524struct encoding_map{
4525 PyObject_HEAD
4526 unsigned char level1[32];
4527 int count2, count3;
4528 unsigned char level23[1];
4529};
4530
4531static PyObject*
4532encoding_map_size(PyObject *obj, PyObject* args)
4533{
4534 struct encoding_map *map = (struct encoding_map*)obj;
Christian Heimes217cfd12007-12-02 14:31:20 +00004535 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004536 128*map->count3);
4537}
4538
4539static PyMethodDef encoding_map_methods[] = {
4540 {"size", encoding_map_size, METH_NOARGS,
4541 PyDoc_STR("Return the size (in bytes) of this object") },
4542 { 0 }
4543};
4544
4545static void
4546encoding_map_dealloc(PyObject* o)
4547{
4548 PyObject_FREE(o);
4549}
4550
/* Type object for the charmap encoding lookup table ("EncodingMap").
   Only three slots are populated: the name/basicsize header, the
   deallocator, and the method table (which exposes "size").  All other
   slots, including tp_new, are left 0.
   NOTE(review): the type pointer in the header is NULL here; presumably it
   is filled in during type initialisation elsewhere -- confirm. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4594
4595PyObject*
4596PyUnicode_BuildEncodingMap(PyObject* string)
4597{
4598 Py_UNICODE *decode;
4599 PyObject *result;
4600 struct encoding_map *mresult;
4601 int i;
4602 int need_dict = 0;
4603 unsigned char level1[32];
4604 unsigned char level2[512];
4605 unsigned char *mlevel1, *mlevel2, *mlevel3;
4606 int count2 = 0, count3 = 0;
4607
4608 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4609 PyErr_BadArgument();
4610 return NULL;
4611 }
4612 decode = PyUnicode_AS_UNICODE(string);
4613 memset(level1, 0xFF, sizeof level1);
4614 memset(level2, 0xFF, sizeof level2);
4615
4616 /* If there isn't a one-to-one mapping of NULL to \0,
4617 or if there are non-BMP characters, we need to use
4618 a mapping dictionary. */
4619 if (decode[0] != 0)
4620 need_dict = 1;
4621 for (i = 1; i < 256; i++) {
4622 int l1, l2;
4623 if (decode[i] == 0
4624 #ifdef Py_UNICODE_WIDE
4625 || decode[i] > 0xFFFF
4626 #endif
4627 ) {
4628 need_dict = 1;
4629 break;
4630 }
4631 if (decode[i] == 0xFFFE)
4632 /* unmapped character */
4633 continue;
4634 l1 = decode[i] >> 11;
4635 l2 = decode[i] >> 7;
4636 if (level1[l1] == 0xFF)
4637 level1[l1] = count2++;
4638 if (level2[l2] == 0xFF)
4639 level2[l2] = count3++;
4640 }
4641
4642 if (count2 >= 0xFF || count3 >= 0xFF)
4643 need_dict = 1;
4644
4645 if (need_dict) {
4646 PyObject *result = PyDict_New();
4647 PyObject *key, *value;
4648 if (!result)
4649 return NULL;
4650 for (i = 0; i < 256; i++) {
4651 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00004652 key = PyLong_FromLong(decode[i]);
4653 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004654 if (!key || !value)
4655 goto failed1;
4656 if (PyDict_SetItem(result, key, value) == -1)
4657 goto failed1;
4658 Py_DECREF(key);
4659 Py_DECREF(value);
4660 }
4661 return result;
4662 failed1:
4663 Py_XDECREF(key);
4664 Py_XDECREF(value);
4665 Py_DECREF(result);
4666 return NULL;
4667 }
4668
4669 /* Create a three-level trie */
4670 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4671 16*count2 + 128*count3 - 1);
4672 if (!result)
4673 return PyErr_NoMemory();
4674 PyObject_Init(result, &EncodingMapType);
4675 mresult = (struct encoding_map*)result;
4676 mresult->count2 = count2;
4677 mresult->count3 = count3;
4678 mlevel1 = mresult->level1;
4679 mlevel2 = mresult->level23;
4680 mlevel3 = mresult->level23 + 16*count2;
4681 memcpy(mlevel1, level1, 32);
4682 memset(mlevel2, 0xFF, 16*count2);
4683 memset(mlevel3, 0, 128*count3);
4684 count3 = 0;
4685 for (i = 1; i < 256; i++) {
4686 int o1, o2, o3, i2, i3;
4687 if (decode[i] == 0xFFFE)
4688 /* unmapped character */
4689 continue;
4690 o1 = decode[i]>>11;
4691 o2 = (decode[i]>>7) & 0xF;
4692 i2 = 16*mlevel1[o1] + o2;
4693 if (mlevel2[i2] == 0xFF)
4694 mlevel2[i2] = count3++;
4695 o3 = decode[i] & 0x7F;
4696 i3 = 128*mlevel2[i2] + o3;
4697 mlevel3[i3] = i;
4698 }
4699 return result;
4700}
4701
4702static int
4703encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4704{
4705 struct encoding_map *map = (struct encoding_map*)mapping;
4706 int l1 = c>>11;
4707 int l2 = (c>>7) & 0xF;
4708 int l3 = c & 0x7F;
4709 int i;
4710
4711#ifdef Py_UNICODE_WIDE
4712 if (c > 0xFFFF) {
4713 return -1;
4714 }
4715#endif
4716 if (c == 0)
4717 return 0;
4718 /* level 1*/
4719 i = map->level1[l1];
4720 if (i == 0xFF) {
4721 return -1;
4722 }
4723 /* level 2*/
4724 i = map->level23[16*i+l2];
4725 if (i == 0xFF) {
4726 return -1;
4727 }
4728 /* level 3 */
4729 i = map->level23[16*map->count2 + 128*i + l3];
4730 if (i == 0) {
4731 return -1;
4732 }
4733 return i;
4734}
4735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736/* Lookup the character ch in the mapping. If the character
4737 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004738 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740{
Christian Heimes217cfd12007-12-02 14:31:20 +00004741 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004742 PyObject *x;
4743
4744 if (w == NULL)
4745 return NULL;
4746 x = PyObject_GetItem(mapping, w);
4747 Py_DECREF(w);
4748 if (x == NULL) {
4749 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4750 /* No mapping found means: mapping is undefined. */
4751 PyErr_Clear();
4752 x = Py_None;
4753 Py_INCREF(x);
4754 return x;
4755 } else
4756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004758 else if (x == Py_None)
4759 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00004760 else if (PyLong_Check(x)) {
4761 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004762 if (value < 0 || value > 255) {
4763 PyErr_SetString(PyExc_TypeError,
4764 "character mapping must be in range(256)");
4765 Py_DECREF(x);
4766 return NULL;
4767 }
4768 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004770 else if (PyBytes_Check(x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004771 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773 /* wrong return value */
Walter Dörwald580ceed2007-05-09 10:39:19 +00004774 PyErr_Format(PyExc_TypeError,
Christian Heimesf3863112007-11-22 07:46:41 +00004775 "character mapping must return integer, bytes or None, not %.400s",
Walter Dörwald580ceed2007-05-09 10:39:19 +00004776 x->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 Py_DECREF(x);
4778 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 }
4780}
4781
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004782static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00004783charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004784{
Christian Heimes72b710a2008-05-26 13:28:38 +00004785 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004786 /* exponentially overallocate to minimize reallocations */
4787 if (requiredsize < 2*outsize)
4788 requiredsize = 2*outsize;
Christian Heimes72b710a2008-05-26 13:28:38 +00004789 if (_PyBytes_Resize(outobj, requiredsize))
Walter Dörwald827b0552007-05-12 13:23:53 +00004790 return -1;
Walter Dörwald827b0552007-05-12 13:23:53 +00004791 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004792}
4793
/* Outcome of encoding a single character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00004798 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 space is available. Return a new reference to the object that
4800 was put in the output buffer, or Py_None, if the mapping was undefined
4801 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004802 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004804charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004805 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004807 PyObject *rep;
4808 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00004809 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810
Christian Heimes90aa7642007-12-19 02:45:37 +00004811 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004812 int res = encoding_map_lookup(c, mapping);
4813 Py_ssize_t requiredsize = *outpos+1;
4814 if (res == -1)
4815 return enc_FAILED;
Guido van Rossum98297ee2007-11-06 21:34:58 +00004816 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004817 if (charmapencode_resize(outobj, outpos, requiredsize))
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004818 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00004819 outstart = PyBytes_AS_STRING(*outobj);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004820 outstart[(*outpos)++] = (char)res;
4821 return enc_SUCCESS;
4822 }
4823
4824 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004825 if (rep==NULL)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004826 return enc_EXCEPTION;
4827 else if (rep==Py_None) {
4828 Py_DECREF(rep);
4829 return enc_FAILED;
4830 } else {
Christian Heimes217cfd12007-12-02 14:31:20 +00004831 if (PyLong_Check(rep)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00004832 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004833 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004834 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004836 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004838 outstart = PyBytes_AS_STRING(*outobj);
Christian Heimes217cfd12007-12-02 14:31:20 +00004839 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004840 }
4841 else {
Christian Heimes72b710a2008-05-26 13:28:38 +00004842 const char *repchars = PyBytes_AS_STRING(rep);
4843 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
Martin v. Löwis18e16552006-02-15 17:27:45 +00004844 Py_ssize_t requiredsize = *outpos+repsize;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004845 if (outsize<requiredsize)
Walter Dörwald827b0552007-05-12 13:23:53 +00004846 if (charmapencode_resize(outobj, outpos, requiredsize)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847 Py_DECREF(rep);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004848 return enc_EXCEPTION;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004850 outstart = PyBytes_AS_STRING(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 memcpy(outstart + *outpos, repchars, repsize);
4852 *outpos += repsize;
4853 }
4854 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004855 Py_DECREF(rep);
4856 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857}
4858
4859/* handle an error in PyUnicode_EncodeCharmap
4860 Return 0 on success, -1 on error */
4861static
4862int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004863 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004865 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00004866 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867{
4868 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004869 Py_ssize_t repsize;
4870 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 Py_UNICODE *uni2;
4872 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t collstartpos = *inpos;
4874 Py_ssize_t collendpos = *inpos+1;
4875 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 char *encoding = "charmap";
4877 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004878 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 /* find all unencodable characters */
4881 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004882 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00004883 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004884 int res = encoding_map_lookup(p[collendpos], mapping);
4885 if (res != -1)
4886 break;
4887 ++collendpos;
4888 continue;
4889 }
4890
4891 rep = charmapencode_lookup(p[collendpos], mapping);
4892 if (rep==NULL)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004894 else if (rep!=Py_None) {
4895 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 break;
4897 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004898 Py_DECREF(rep);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 ++collendpos;
4900 }
4901 /* cache callback name lookup
4902 * (if not done yet, i.e. it's the first error) */
4903 if (*known_errorHandler==-1) {
4904 if ((errors==NULL) || (!strcmp(errors, "strict")))
4905 *known_errorHandler = 1;
4906 else if (!strcmp(errors, "replace"))
4907 *known_errorHandler = 2;
4908 else if (!strcmp(errors, "ignore"))
4909 *known_errorHandler = 3;
4910 else if (!strcmp(errors, "xmlcharrefreplace"))
4911 *known_errorHandler = 4;
4912 else
4913 *known_errorHandler = 0;
4914 }
4915 switch (*known_errorHandler) {
4916 case 1: /* strict */
4917 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4918 return -1;
4919 case 2: /* replace */
4920 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4921 x = charmapencode_output('?', mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004922 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004923 return -1;
4924 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004925 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004926 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4927 return -1;
4928 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929 }
4930 /* fall through */
4931 case 3: /* ignore */
4932 *inpos = collendpos;
4933 break;
4934 case 4: /* xmlcharrefreplace */
4935 /* generate replacement (temporarily (mis)uses p) */
4936 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4937 char buffer[2+29+1+1];
4938 char *cp;
4939 sprintf(buffer, "&#%d;", (int)p[collpos]);
4940 for (cp = buffer; *cp; ++cp) {
4941 x = charmapencode_output(*cp, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004942 if (x==enc_EXCEPTION)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004943 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004944 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4946 return -1;
4947 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 }
4949 }
4950 *inpos = collendpos;
4951 break;
4952 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004953 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004954 encoding, reason, p, size, exceptionObject,
4955 collstartpos, collendpos, &newpos);
4956 if (repunicode == NULL)
4957 return -1;
4958 /* generate replacement */
4959 repsize = PyUnicode_GET_SIZE(repunicode);
4960 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4961 x = charmapencode_output(*uni2, mapping, res, respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004962 if (x==enc_EXCEPTION) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004963 return -1;
4964 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004965 else if (x==enc_FAILED) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004966 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4968 return -1;
4969 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 }
4971 *inpos = newpos;
4972 Py_DECREF(repunicode);
4973 }
4974 return 0;
4975}
4976
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004978 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 PyObject *mapping,
4980 const char *errors)
4981{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004982 /* output object */
4983 PyObject *res = NULL;
4984 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004985 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004987 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988 PyObject *errorHandler = NULL;
4989 PyObject *exc = NULL;
4990 /* the following variable is used for caching string comparisons
4991 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4992 * 3=ignore, 4=xmlcharrefreplace */
4993 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994
4995 /* Default to Latin-1 */
4996 if (mapping == NULL)
4997 return PyUnicode_EncodeLatin1(p, size, errors);
4998
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004999 /* allocate enough for a simple encoding without
5000 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005001 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 if (res == NULL)
5003 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005004 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005005 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 while (inpos<size) {
5008 /* try to encode it */
Guido van Rossum98297ee2007-11-06 21:34:58 +00005009 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005010 if (x==enc_EXCEPTION) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011 goto onError;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005012 if (x==enc_FAILED) { /* unencodable character */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005013 if (charmap_encoding_error(p, size, &inpos, mapping,
5014 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005015 &known_errorHandler, &errorHandler, errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005016 &res, &respos)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00005017 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00005018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005020 else
5021 /* done with this character => adjust input position */
5022 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005025 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005026 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005027 if (_PyBytes_Resize(&res, respos) < 0)
5028 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005030 Py_XDECREF(exc);
5031 Py_XDECREF(errorHandler);
5032 return res;
5033
5034 onError:
5035 Py_XDECREF(res);
5036 Py_XDECREF(exc);
5037 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038 return NULL;
5039}
5040
5041PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5042 PyObject *mapping)
5043{
5044 if (!PyUnicode_Check(unicode) || mapping == NULL) {
5045 PyErr_BadArgument();
5046 return NULL;
5047 }
5048 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5049 PyUnicode_GET_SIZE(unicode),
5050 mapping,
5051 NULL);
5052}
5053
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005054/* create or adjust a UnicodeTranslateError */
5055static void make_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005056 const Py_UNICODE *unicode, Py_ssize_t size,
5057 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005058 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005060 if (*exceptionObject == NULL) {
5061 *exceptionObject = PyUnicodeTranslateError_Create(
5062 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 }
5064 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005065 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5066 goto onError;
5067 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5068 goto onError;
5069 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5070 goto onError;
5071 return;
5072 onError:
5073 Py_DECREF(*exceptionObject);
5074 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 }
5076}
5077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005078/* raises a UnicodeTranslateError */
5079static void raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005080 const Py_UNICODE *unicode, Py_ssize_t size,
5081 Py_ssize_t startpos, Py_ssize_t endpos,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005082 const char *reason)
5083{
5084 make_translate_exception(exceptionObject,
5085 unicode, size, startpos, endpos, reason);
5086 if (*exceptionObject != NULL)
5087 PyCodec_StrictErrors(*exceptionObject);
5088}
5089
5090/* error handling callback helper:
5091 build arguments, call the callback and check the arguments,
5092 put the result into newpos and return the replacement string, which
5093 has to be freed by the caller */
5094static PyObject *unicode_translate_call_errorhandler(const char *errors,
5095 PyObject **errorHandler,
5096 const char *reason,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005097 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5098 Py_ssize_t startpos, Py_ssize_t endpos,
5099 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005100{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005101 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005103 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005104 PyObject *restuple;
5105 PyObject *resunicode;
5106
5107 if (*errorHandler == NULL) {
5108 *errorHandler = PyCodec_LookupError(errors);
5109 if (*errorHandler == NULL)
5110 return NULL;
5111 }
5112
5113 make_translate_exception(exceptionObject,
5114 unicode, size, startpos, endpos, reason);
5115 if (*exceptionObject == NULL)
5116 return NULL;
5117
5118 restuple = PyObject_CallFunctionObjArgs(
5119 *errorHandler, *exceptionObject, NULL);
5120 if (restuple == NULL)
5121 return NULL;
5122 if (!PyTuple_Check(restuple)) {
5123 PyErr_Format(PyExc_TypeError, &argparse[4]);
5124 Py_DECREF(restuple);
5125 return NULL;
5126 }
5127 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005128 &resunicode, &i_newpos)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005129 Py_DECREF(restuple);
5130 return NULL;
5131 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005132 if (i_newpos<0)
5133 *newpos = size+i_newpos;
5134 else
5135 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005136 if (*newpos<0 || *newpos>size) {
Martin v. Löwis2c95cc62006-02-16 06:54:25 +00005137 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005138 Py_DECREF(restuple);
5139 return NULL;
5140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141 Py_INCREF(resunicode);
5142 Py_DECREF(restuple);
5143 return resunicode;
5144}
5145
5146/* Lookup the character ch in the mapping and put the result in result,
5147 which must be decrefed by the caller.
5148 Return 0 on success, -1 on error */
5149static
5150int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5151{
Christian Heimes217cfd12007-12-02 14:31:20 +00005152 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005153 PyObject *x;
5154
5155 if (w == NULL)
5156 return -1;
5157 x = PyObject_GetItem(mapping, w);
5158 Py_DECREF(w);
5159 if (x == NULL) {
5160 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5161 /* No mapping found means: use 1:1 mapping. */
5162 PyErr_Clear();
5163 *result = NULL;
5164 return 0;
5165 } else
5166 return -1;
5167 }
5168 else if (x == Py_None) {
5169 *result = x;
5170 return 0;
5171 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005172 else if (PyLong_Check(x)) {
5173 long value = PyLong_AS_LONG(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174 long max = PyUnicode_GetMax();
5175 if (value < 0 || value > max) {
5176 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005177 "character mapping must be in range(0x%x)", max+1);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005178 Py_DECREF(x);
5179 return -1;
5180 }
5181 *result = x;
5182 return 0;
5183 }
5184 else if (PyUnicode_Check(x)) {
5185 *result = x;
5186 return 0;
5187 }
5188 else {
5189 /* wrong return value */
5190 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00005191 "character mapping must return integer, None or str");
Walter Dörwald150523e2003-08-15 16:52:19 +00005192 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193 return -1;
5194 }
5195}
5196/* ensure that *outobj is at least requiredsize characters long,
5197if not reallocate and adjust various state variables.
5198Return 0 on success, -1 on error */
5199static
Walter Dörwald4894c302003-10-24 14:25:28 +00005200int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005201 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005204 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 /* remember old output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005207 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00005208 if (requiredsize < 2 * oldsize)
5209 requiredsize = 2 * oldsize;
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005210 if (PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211 return -1;
5212 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 }
5214 return 0;
5215}
5216/* lookup the character, put the result in the output string and adjust
5217 various state variables. Return a new reference to the object that
5218 was put in the output buffer in *result, or Py_None, if the mapping was
5219 undefined (in which case no character was written).
5220 The called must decref result.
5221 Return 0 on success, -1 on error. */
5222static
Walter Dörwald4894c302003-10-24 14:25:28 +00005223int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005224 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald4894c302003-10-24 14:25:28 +00005225 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226{
Walter Dörwald4894c302003-10-24 14:25:28 +00005227 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005228 return -1;
5229 if (*res==NULL) {
5230 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00005231 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005232 }
5233 else if (*res==Py_None)
5234 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005235 else if (PyLong_Check(*res)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005236 /* no overflow check, because we know that the space is enough */
Christian Heimes217cfd12007-12-02 14:31:20 +00005237 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 }
5239 else if (PyUnicode_Check(*res)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00005240 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 if (repsize==1) {
5242 /* no overflow check, because we know that the space is enough */
5243 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5244 }
5245 else if (repsize!=0) {
5246 /* more than one character */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005247 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00005248 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00005249 repsize - 1;
5250 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005251 return -1;
5252 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5253 *outp += repsize;
5254 }
5255 }
5256 else
5257 return -1;
5258 return 0;
5259}
5260
5261PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005262 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 PyObject *mapping,
5264 const char *errors)
5265{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005266 /* output object */
5267 PyObject *res = NULL;
5268 /* pointers to the beginning and end+1 of input */
5269 const Py_UNICODE *startp = p;
5270 const Py_UNICODE *endp = p + size;
5271 /* pointer into the output */
5272 Py_UNICODE *str;
5273 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005274 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005275 char *reason = "character maps to <undefined>";
5276 PyObject *errorHandler = NULL;
5277 PyObject *exc = NULL;
5278 /* the following variable is used for caching string comparisons
5279 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5280 * 3=ignore, 4=xmlcharrefreplace */
5281 int known_errorHandler = -1;
5282
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 if (mapping == NULL) {
5284 PyErr_BadArgument();
5285 return NULL;
5286 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005287
5288 /* allocate enough for a simple 1:1 translation without
5289 replacements, if we need more, we'll resize */
5290 res = PyUnicode_FromUnicode(NULL, size);
5291 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00005292 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005294 return res;
5295 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005297 while (p<endp) {
5298 /* try to encode it */
5299 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00005300 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005301 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 goto onError;
5303 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005304 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005305 if (x!=Py_None) /* it worked => adjust input pointer */
5306 ++p;
5307 else { /* untranslatable character */
5308 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005309 Py_ssize_t repsize;
5310 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005311 Py_UNICODE *uni2;
5312 /* startpos for collecting untranslatable chars */
5313 const Py_UNICODE *collstart = p;
5314 const Py_UNICODE *collend = p+1;
5315 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005317 /* find all untranslatable characters */
5318 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00005319 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320 goto onError;
5321 Py_XDECREF(x);
5322 if (x!=Py_None)
5323 break;
5324 ++collend;
5325 }
5326 /* cache callback name lookup
5327 * (if not done yet, i.e. it's the first error) */
5328 if (known_errorHandler==-1) {
5329 if ((errors==NULL) || (!strcmp(errors, "strict")))
5330 known_errorHandler = 1;
5331 else if (!strcmp(errors, "replace"))
5332 known_errorHandler = 2;
5333 else if (!strcmp(errors, "ignore"))
5334 known_errorHandler = 3;
5335 else if (!strcmp(errors, "xmlcharrefreplace"))
5336 known_errorHandler = 4;
5337 else
5338 known_errorHandler = 0;
5339 }
5340 switch (known_errorHandler) {
5341 case 1: /* strict */
5342 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5343 goto onError;
5344 case 2: /* replace */
5345 /* No need to check for space, this is a 1:1 replacement */
5346 for (coll = collstart; coll<collend; ++coll)
5347 *str++ = '?';
5348 /* fall through */
5349 case 3: /* ignore */
5350 p = collend;
5351 break;
5352 case 4: /* xmlcharrefreplace */
5353 /* generate replacement (temporarily (mis)uses p) */
5354 for (p = collstart; p < collend; ++p) {
5355 char buffer[2+29+1+1];
5356 char *cp;
5357 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00005358 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5360 goto onError;
5361 for (cp = buffer; *cp; ++cp)
5362 *str++ = *cp;
5363 }
5364 p = collend;
5365 break;
5366 default:
5367 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5368 reason, startp, size, &exc,
5369 collstart-startp, collend-startp, &newpos);
5370 if (repunicode == NULL)
5371 goto onError;
5372 /* generate replacement */
5373 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00005374 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005375 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5376 Py_DECREF(repunicode);
5377 goto onError;
5378 }
5379 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5380 *str++ = *uni2;
5381 p = startp + newpos;
5382 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 }
5384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 /* Resize if we allocated to much */
5387 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005388 if (respos<PyUnicode_GET_SIZE(res)) {
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005389 if (PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005390 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005391 }
5392 Py_XDECREF(exc);
5393 Py_XDECREF(errorHandler);
5394 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005396 onError:
5397 Py_XDECREF(res);
5398 Py_XDECREF(exc);
5399 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 return NULL;
5401}
5402
5403PyObject *PyUnicode_Translate(PyObject *str,
5404 PyObject *mapping,
5405 const char *errors)
5406{
5407 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005408
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 str = PyUnicode_FromObject(str);
5410 if (str == NULL)
5411 goto onError;
5412 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5413 PyUnicode_GET_SIZE(str),
5414 mapping,
5415 errors);
5416 Py_DECREF(str);
5417 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005418
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 onError:
5420 Py_XDECREF(str);
5421 return NULL;
5422}
Tim Petersced69f82003-09-16 20:30:58 +00005423
Guido van Rossum9e896b32000-04-05 20:11:21 +00005424/* --- Decimal Encoder ---------------------------------------------------- */
5425
5426int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005427 Py_ssize_t length,
Guido van Rossum9e896b32000-04-05 20:11:21 +00005428 char *output,
5429 const char *errors)
5430{
5431 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 PyObject *errorHandler = NULL;
5433 PyObject *exc = NULL;
5434 const char *encoding = "decimal";
5435 const char *reason = "invalid decimal Unicode string";
5436 /* the following variable is used for caching string comparisons
5437 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5438 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005439
5440 if (output == NULL) {
5441 PyErr_BadArgument();
5442 return -1;
5443 }
5444
5445 p = s;
5446 end = s + length;
5447 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005448 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005449 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 PyObject *repunicode;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005451 Py_ssize_t repsize;
5452 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453 Py_UNICODE *uni2;
5454 Py_UNICODE *collstart;
5455 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005456
Guido van Rossum9e896b32000-04-05 20:11:21 +00005457 if (Py_UNICODE_ISSPACE(ch)) {
5458 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005460 continue;
5461 }
5462 decimal = Py_UNICODE_TODECIMAL(ch);
5463 if (decimal >= 0) {
5464 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005465 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005466 continue;
5467 }
Guido van Rossumba477042000-04-06 18:18:10 +00005468 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00005469 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005470 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005471 continue;
5472 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473 /* All other characters are considered unencodable */
5474 collstart = p;
5475 collend = p+1;
5476 while (collend < end) {
5477 if ((0 < *collend && *collend < 256) ||
5478 !Py_UNICODE_ISSPACE(*collend) ||
5479 Py_UNICODE_TODECIMAL(*collend))
5480 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005481 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 /* cache callback name lookup
5483 * (if not done yet, i.e. it's the first error) */
5484 if (known_errorHandler==-1) {
5485 if ((errors==NULL) || (!strcmp(errors, "strict")))
5486 known_errorHandler = 1;
5487 else if (!strcmp(errors, "replace"))
5488 known_errorHandler = 2;
5489 else if (!strcmp(errors, "ignore"))
5490 known_errorHandler = 3;
5491 else if (!strcmp(errors, "xmlcharrefreplace"))
5492 known_errorHandler = 4;
5493 else
5494 known_errorHandler = 0;
5495 }
5496 switch (known_errorHandler) {
5497 case 1: /* strict */
5498 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5499 goto onError;
5500 case 2: /* replace */
5501 for (p = collstart; p < collend; ++p)
5502 *output++ = '?';
5503 /* fall through */
5504 case 3: /* ignore */
5505 p = collend;
5506 break;
5507 case 4: /* xmlcharrefreplace */
5508 /* generate replacement (temporarily (mis)uses p) */
5509 for (p = collstart; p < collend; ++p)
5510 output += sprintf(output, "&#%d;", (int)*p);
5511 p = collend;
5512 break;
5513 default:
5514 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5515 encoding, reason, s, length, &exc,
5516 collstart-s, collend-s, &newpos);
5517 if (repunicode == NULL)
5518 goto onError;
5519 /* generate replacement */
5520 repsize = PyUnicode_GET_SIZE(repunicode);
5521 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5522 Py_UNICODE ch = *uni2;
5523 if (Py_UNICODE_ISSPACE(ch))
5524 *output++ = ' ';
5525 else {
5526 decimal = Py_UNICODE_TODECIMAL(ch);
5527 if (decimal >= 0)
5528 *output++ = '0' + decimal;
5529 else if (0 < ch && ch < 256)
5530 *output++ = (char)ch;
5531 else {
5532 Py_DECREF(repunicode);
5533 raise_encode_exception(&exc, encoding,
5534 s, length, collstart-s, collend-s, reason);
5535 goto onError;
5536 }
5537 }
5538 }
5539 p = s + newpos;
5540 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005541 }
5542 }
5543 /* 0-terminate the output string */
5544 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005545 Py_XDECREF(exc);
5546 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005547 return 0;
5548
5549 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 Py_XDECREF(exc);
5551 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005552 return -1;
5553}
5554
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555/* --- Helpers ------------------------------------------------------------ */
5556
Eric Smith8c663262007-08-25 02:26:07 +00005557#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005558#include "stringlib/fastsearch.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00005559#include "stringlib/count.h"
Christian Heimes9cd17752007-11-18 19:35:23 +00005560/* Include _ParseTupleFinds from find.h */
5561#define FROM_UNICODE
Thomas Wouters477c8d52006-05-27 19:21:47 +00005562#include "stringlib/find.h"
5563#include "stringlib/partition.h"
5564
Eric Smith5807c412008-05-11 21:00:57 +00005565#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5566#include "stringlib/localeutil.h"
5567
/* Helper macro to fix up start/end slice values against (obj)->length.

   It reads and updates the variables `start` and `end` of the enclosing
   scope: a negative value has the length added to it once and is then
   clamped to 0; `end` is additionally capped at the length.  `start` is
   not capped at the length.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to exactly one statement and is safe under an unbraced if/else.  Invoke
   it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5580
Martin v. Löwis18e16552006-02-15 17:27:45 +00005581Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005582 PyObject *substr,
5583 Py_ssize_t start,
5584 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005586 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005587 PyUnicodeObject* str_obj;
5588 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005589
Thomas Wouters477c8d52006-05-27 19:21:47 +00005590 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5591 if (!str_obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005593 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5594 if (!sub_obj) {
5595 Py_DECREF(str_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596 return -1;
5597 }
Tim Petersced69f82003-09-16 20:30:58 +00005598
Thomas Wouters477c8d52006-05-27 19:21:47 +00005599 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005600
Thomas Wouters477c8d52006-05-27 19:21:47 +00005601 result = stringlib_count(
5602 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5603 );
5604
5605 Py_DECREF(sub_obj);
5606 Py_DECREF(str_obj);
5607
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 return result;
5609}
5610
Martin v. Löwis18e16552006-02-15 17:27:45 +00005611Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00005612 PyObject *sub,
5613 Py_ssize_t start,
5614 Py_ssize_t end,
5615 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005617 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005618
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005620 if (!str)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005621 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005622 sub = PyUnicode_FromObject(sub);
5623 if (!sub) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00005624 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00005625 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 }
Tim Petersced69f82003-09-16 20:30:58 +00005627
Thomas Wouters477c8d52006-05-27 19:21:47 +00005628 if (direction > 0)
5629 result = stringlib_find_slice(
5630 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5631 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5632 start, end
5633 );
5634 else
5635 result = stringlib_rfind_slice(
5636 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5637 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5638 start, end
5639 );
5640
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00005642 Py_DECREF(sub);
5643
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 return result;
5645}
5646
Tim Petersced69f82003-09-16 20:30:58 +00005647static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648int tailmatch(PyUnicodeObject *self,
5649 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005650 Py_ssize_t start,
5651 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 int direction)
5653{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 if (substring->length == 0)
5655 return 1;
5656
Thomas Wouters477c8d52006-05-27 19:21:47 +00005657 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658
5659 end -= substring->length;
5660 if (end < start)
5661 return 0;
5662
5663 if (direction > 0) {
5664 if (Py_UNICODE_MATCH(self, end, substring))
5665 return 1;
5666 } else {
5667 if (Py_UNICODE_MATCH(self, start, substring))
5668 return 1;
5669 }
5670
5671 return 0;
5672}
5673
Martin v. Löwis18e16552006-02-15 17:27:45 +00005674Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 PyObject *substr,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005676 Py_ssize_t start,
5677 Py_ssize_t end,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 int direction)
5679{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005680 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005681
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 str = PyUnicode_FromObject(str);
5683 if (str == NULL)
5684 return -1;
5685 substr = PyUnicode_FromObject(substr);
5686 if (substr == NULL) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005687 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 return -1;
5689 }
Tim Petersced69f82003-09-16 20:30:58 +00005690
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 result = tailmatch((PyUnicodeObject *)str,
5692 (PyUnicodeObject *)substr,
5693 start, end, direction);
5694 Py_DECREF(str);
5695 Py_DECREF(substr);
5696 return result;
5697}
5698
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699/* Apply fixfct filter to the Unicode object self and return a
5700 reference to the modified object */
5701
Tim Petersced69f82003-09-16 20:30:58 +00005702static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703PyObject *fixup(PyUnicodeObject *self,
5704 int (*fixfct)(PyUnicodeObject *s))
5705{
5706
5707 PyUnicodeObject *u;
5708
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005709 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 if (u == NULL)
5711 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005712
5713 Py_UNICODE_COPY(u->str, self->str, self->length);
5714
Tim Peters7a29bd52001-09-12 03:03:31 +00005715 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 /* fixfct should return TRUE if it modified the buffer. If
5717 FALSE, return a reference to the original buffer instead
5718 (to save space, not time) */
5719 Py_INCREF(self);
5720 Py_DECREF(u);
5721 return (PyObject*) self;
5722 }
5723 return (PyObject*) u;
5724}
5725
Tim Petersced69f82003-09-16 20:30:58 +00005726static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727int fixupper(PyUnicodeObject *self)
5728{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005729 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 Py_UNICODE *s = self->str;
5731 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005732
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 while (len-- > 0) {
5734 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005735
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 ch = Py_UNICODE_TOUPPER(*s);
5737 if (ch != *s) {
5738 status = 1;
5739 *s = ch;
5740 }
5741 s++;
5742 }
5743
5744 return status;
5745}
5746
Tim Petersced69f82003-09-16 20:30:58 +00005747static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748int fixlower(PyUnicodeObject *self)
5749{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005750 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 Py_UNICODE *s = self->str;
5752 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005753
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 while (len-- > 0) {
5755 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005756
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 ch = Py_UNICODE_TOLOWER(*s);
5758 if (ch != *s) {
5759 status = 1;
5760 *s = ch;
5761 }
5762 s++;
5763 }
5764
5765 return status;
5766}
5767
Tim Petersced69f82003-09-16 20:30:58 +00005768static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769int fixswapcase(PyUnicodeObject *self)
5770{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005771 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 Py_UNICODE *s = self->str;
5773 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005774
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 while (len-- > 0) {
5776 if (Py_UNICODE_ISUPPER(*s)) {
5777 *s = Py_UNICODE_TOLOWER(*s);
5778 status = 1;
5779 } else if (Py_UNICODE_ISLOWER(*s)) {
5780 *s = Py_UNICODE_TOUPPER(*s);
5781 status = 1;
5782 }
5783 s++;
5784 }
5785
5786 return status;
5787}
5788
Tim Petersced69f82003-09-16 20:30:58 +00005789static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790int fixcapitalize(PyUnicodeObject *self)
5791{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005792 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005793 Py_UNICODE *s = self->str;
5794 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005795
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005796 if (len == 0)
5797 return 0;
5798 if (Py_UNICODE_ISLOWER(*s)) {
5799 *s = Py_UNICODE_TOUPPER(*s);
5800 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005802 s++;
5803 while (--len > 0) {
5804 if (Py_UNICODE_ISUPPER(*s)) {
5805 *s = Py_UNICODE_TOLOWER(*s);
5806 status = 1;
5807 }
5808 s++;
5809 }
5810 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811}
5812
5813static
5814int fixtitle(PyUnicodeObject *self)
5815{
5816 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5817 register Py_UNICODE *e;
5818 int previous_is_cased;
5819
5820 /* Shortcut for single character strings */
5821 if (PyUnicode_GET_SIZE(self) == 1) {
5822 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5823 if (*p != ch) {
5824 *p = ch;
5825 return 1;
5826 }
5827 else
5828 return 0;
5829 }
Tim Petersced69f82003-09-16 20:30:58 +00005830
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 e = p + PyUnicode_GET_SIZE(self);
5832 previous_is_cased = 0;
5833 for (; p < e; p++) {
5834 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005835
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 if (previous_is_cased)
5837 *p = Py_UNICODE_TOLOWER(ch);
5838 else
5839 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005840
5841 if (Py_UNICODE_ISLOWER(ch) ||
5842 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 Py_UNICODE_ISTITLE(ch))
5844 previous_is_cased = 1;
5845 else
5846 previous_is_cased = 0;
5847 }
5848 return 1;
5849}
5850
Tim Peters8ce9f162004-08-27 01:49:32 +00005851PyObject *
5852PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853{
Skip Montanaro6543b452004-09-16 03:28:13 +00005854 const Py_UNICODE blank = ' ';
5855 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00005856 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005857 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00005858 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5859 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005860 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5861 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00005862 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005863 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864
Tim Peters05eba1f2004-08-27 21:32:02 +00005865 fseq = PySequence_Fast(seq, "");
5866 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00005867 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005868 }
5869
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005870 /* NOTE: the following code can't call back into Python code,
5871 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00005872 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005873
Tim Peters05eba1f2004-08-27 21:32:02 +00005874 seqlen = PySequence_Fast_GET_SIZE(fseq);
5875 /* If empty sequence, return u"". */
5876 if (seqlen == 0) {
5877 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5878 goto Done;
5879 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005880 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005881 /* If singleton sequence with an exact Unicode, return that. */
5882 if (seqlen == 1) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005883 item = items[0];
Tim Peters05eba1f2004-08-27 21:32:02 +00005884 if (PyUnicode_CheckExact(item)) {
5885 Py_INCREF(item);
5886 res = (PyUnicodeObject *)item;
5887 goto Done;
5888 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005889 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005890 else {
5891 /* Set up sep and seplen */
5892 if (separator == NULL) {
5893 sep = &blank;
5894 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005895 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005896 else {
5897 if (!PyUnicode_Check(separator)) {
5898 PyErr_Format(PyExc_TypeError,
5899 "separator: expected str instance,"
5900 " %.80s found",
5901 Py_TYPE(separator)->tp_name);
5902 goto onError;
5903 }
5904 sep = PyUnicode_AS_UNICODE(separator);
5905 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005906 }
5907 }
5908
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005909 /* There are at least two things to join, or else we have a subclass
5910 * of str in the sequence.
5911 * Do a pre-pass to figure out the total amount of space we'll
5912 * need (sz), and see whether all argument are strings.
5913 */
5914 sz = 0;
5915 for (i = 0; i < seqlen; i++) {
5916 const Py_ssize_t old_sz = sz;
5917 item = items[i];
Guido van Rossum98297ee2007-11-06 21:34:58 +00005918 if (!PyUnicode_Check(item)) {
5919 PyErr_Format(PyExc_TypeError,
5920 "sequence item %zd: expected str instance,"
5921 " %.80s found",
Christian Heimes90aa7642007-12-19 02:45:37 +00005922 i, Py_TYPE(item)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005923 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005924 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005925 sz += PyUnicode_GET_SIZE(item);
5926 if (i != 0)
5927 sz += seplen;
5928 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5929 PyErr_SetString(PyExc_OverflowError,
5930 "join() result is too long for a Python string");
5931 goto onError;
5932 }
5933 }
Tim Petersced69f82003-09-16 20:30:58 +00005934
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005935 res = _PyUnicode_New(sz);
5936 if (res == NULL)
5937 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00005938
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005939 /* Catenate everything. */
5940 res_p = PyUnicode_AS_UNICODE(res);
5941 for (i = 0; i < seqlen; ++i) {
5942 Py_ssize_t itemlen;
5943 item = items[i];
5944 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00005945 /* Copy item, and maybe the separator. */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005946 if (i) {
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005947 Py_UNICODE_COPY(res_p, sep, seplen);
Tim Peters05eba1f2004-08-27 21:32:02 +00005948 res_p += seplen;
5949 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00005950 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5951 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00005952 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005953
Tim Peters8ce9f162004-08-27 01:49:32 +00005954 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00005955 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 return (PyObject *)res;
5957
5958 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00005959 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005960 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 return NULL;
5962}
5963
Tim Petersced69f82003-09-16 20:30:58 +00005964static
5965PyUnicodeObject *pad(PyUnicodeObject *self,
Martin v. Löwis18e16552006-02-15 17:27:45 +00005966 Py_ssize_t left,
5967 Py_ssize_t right,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 Py_UNICODE fill)
5969{
5970 PyUnicodeObject *u;
5971
5972 if (left < 0)
5973 left = 0;
5974 if (right < 0)
5975 right = 0;
5976
Tim Peters7a29bd52001-09-12 03:03:31 +00005977 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 Py_INCREF(self);
5979 return self;
5980 }
5981
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005982 if (left > PY_SSIZE_T_MAX - self->length ||
5983 right > PY_SSIZE_T_MAX - (left + self->length)) {
5984 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5985 return NULL;
5986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 u = _PyUnicode_New(left + self->length + right);
5988 if (u) {
5989 if (left)
5990 Py_UNICODE_FILL(u->str, fill, left);
5991 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5992 if (right)
5993 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5994 }
5995
5996 return u;
5997}
5998
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on the enclosing function for a PyObject* variable `str`
   (scratch), the `list` being built, and an `onError` label, which is
   jumped to when creating the string or appending it fails.  The
   temporary reference held in `str` is released in either case.

   Wrapped in do { ... } while (0) so that the macro expands to exactly
   one statement: the previous form ended in an `else` branch, which left
   a stray empty statement behind the caller's semicolon and could not be
   used under an unbraced if/else.  Invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
6009
6010static
6011PyObject *split_whitespace(PyUnicodeObject *self,
6012 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006013 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006015 register Py_ssize_t i;
6016 register Py_ssize_t j;
6017 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006019 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
6021 for (i = j = 0; i < len; ) {
6022 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00006023 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 i++;
6025 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00006026 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 i++;
6028 if (j < i) {
6029 if (maxcount-- <= 0)
6030 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006031 SPLIT_APPEND(buf, j, i);
6032 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 i++;
6034 j = i;
6035 }
6036 }
6037 if (j < len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006038 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 }
6040 return list;
6041
6042 onError:
6043 Py_DECREF(list);
6044 return NULL;
6045}
6046
6047PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00006048 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006050 register Py_ssize_t i;
6051 register Py_ssize_t j;
6052 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 PyObject *list;
6054 PyObject *str;
6055 Py_UNICODE *data;
6056
6057 string = PyUnicode_FromObject(string);
6058 if (string == NULL)
6059 return NULL;
6060 data = PyUnicode_AS_UNICODE(string);
6061 len = PyUnicode_GET_SIZE(string);
6062
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 list = PyList_New(0);
6064 if (!list)
6065 goto onError;
6066
6067 for (i = j = 0; i < len; ) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00006068 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00006069
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 /* Find a line and append it */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006071 while (i < len && !BLOOM_LINEBREAK(data[i]))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
6074 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00006075 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 if (i < len) {
6077 if (data[i] == '\r' && i + 1 < len &&
6078 data[i+1] == '\n')
6079 i += 2;
6080 else
6081 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00006082 if (keepends)
6083 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 }
Guido van Rossum86662912000-04-11 15:38:46 +00006085 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 j = i;
6087 }
6088 if (j < len) {
6089 SPLIT_APPEND(data, j, len);
6090 }
6091
6092 Py_DECREF(string);
6093 return list;
6094
6095 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006096 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 Py_DECREF(string);
6098 return NULL;
6099}
6100
Tim Petersced69f82003-09-16 20:30:58 +00006101static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102PyObject *split_char(PyUnicodeObject *self,
6103 PyObject *list,
6104 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006105 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006107 register Py_ssize_t i;
6108 register Py_ssize_t j;
6109 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006111 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112
6113 for (i = j = 0; i < len; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006114 if (buf[i] == ch) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 if (maxcount-- <= 0)
6116 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006117 SPLIT_APPEND(buf, j, i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 i = j = i + 1;
6119 } else
6120 i++;
6121 }
6122 if (j <= len) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006123 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 }
6125 return list;
6126
6127 onError:
6128 Py_DECREF(list);
6129 return NULL;
6130}
6131
Tim Petersced69f82003-09-16 20:30:58 +00006132static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133PyObject *split_substring(PyUnicodeObject *self,
6134 PyObject *list,
6135 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006136 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006138 register Py_ssize_t i;
6139 register Py_ssize_t j;
6140 Py_ssize_t len = self->length;
6141 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 PyObject *str;
6143
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00006144 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 if (Py_UNICODE_MATCH(self, i, substring)) {
6146 if (maxcount-- <= 0)
6147 break;
6148 SPLIT_APPEND(self->str, j, i);
6149 i = j = i + sublen;
6150 } else
6151 i++;
6152 }
6153 if (j <= len) {
6154 SPLIT_APPEND(self->str, j, len);
6155 }
6156 return list;
6157
6158 onError:
6159 Py_DECREF(list);
6160 return NULL;
6161}
6162
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006163static
6164PyObject *rsplit_whitespace(PyUnicodeObject *self,
6165 PyObject *list,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006166 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006168 register Py_ssize_t i;
6169 register Py_ssize_t j;
6170 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006171 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006172 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006173
6174 for (i = j = len - 1; i >= 0; ) {
6175 /* find a token */
Christian Heimes190d79e2008-01-30 11:58:22 +00006176 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006177 i--;
6178 j = i;
Christian Heimes190d79e2008-01-30 11:58:22 +00006179 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006180 i--;
6181 if (j > i) {
6182 if (maxcount-- <= 0)
6183 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006184 SPLIT_APPEND(buf, i + 1, j + 1);
6185 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006186 i--;
6187 j = i;
6188 }
6189 }
6190 if (j >= 0) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006191 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006192 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006193 if (PyList_Reverse(list) < 0)
6194 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006195 return list;
6196
6197 onError:
6198 Py_DECREF(list);
6199 return NULL;
6200}
6201
6202static
6203PyObject *rsplit_char(PyUnicodeObject *self,
6204 PyObject *list,
6205 Py_UNICODE ch,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006206 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006207{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006208 register Py_ssize_t i;
6209 register Py_ssize_t j;
6210 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006211 PyObject *str;
Christian Heimes190d79e2008-01-30 11:58:22 +00006212 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006213
6214 for (i = j = len - 1; i >= 0; ) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006215 if (buf[i] == ch) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006216 if (maxcount-- <= 0)
6217 break;
Christian Heimes190d79e2008-01-30 11:58:22 +00006218 SPLIT_APPEND(buf, i + 1, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006219 j = i = i - 1;
6220 } else
6221 i--;
6222 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00006223 if (j >= -1) {
Christian Heimes190d79e2008-01-30 11:58:22 +00006224 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006225 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006226 if (PyList_Reverse(list) < 0)
6227 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006228 return list;
6229
6230 onError:
6231 Py_DECREF(list);
6232 return NULL;
6233}
6234
6235static
6236PyObject *rsplit_substring(PyUnicodeObject *self,
6237 PyObject *list,
6238 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006239 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006240{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006241 register Py_ssize_t i;
6242 register Py_ssize_t j;
6243 Py_ssize_t len = self->length;
6244 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006245 PyObject *str;
6246
6247 for (i = len - sublen, j = len; i >= 0; ) {
6248 if (Py_UNICODE_MATCH(self, i, substring)) {
6249 if (maxcount-- <= 0)
6250 break;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006251 SPLIT_APPEND(self->str, i + sublen, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006252 j = i;
6253 i -= sublen;
6254 } else
6255 i--;
6256 }
6257 if (j >= 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006258 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006259 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006260 if (PyList_Reverse(list) < 0)
6261 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006262 return list;
6263
6264 onError:
6265 Py_DECREF(list);
6266 return NULL;
6267}
6268
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269#undef SPLIT_APPEND
6270
6271static
6272PyObject *split(PyUnicodeObject *self,
6273 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006274 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275{
6276 PyObject *list;
6277
6278 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006279 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280
6281 list = PyList_New(0);
6282 if (!list)
6283 return NULL;
6284
6285 if (substring == NULL)
6286 return split_whitespace(self,list,maxcount);
6287
6288 else if (substring->length == 1)
6289 return split_char(self,list,substring->str[0],maxcount);
6290
6291 else if (substring->length == 0) {
6292 Py_DECREF(list);
6293 PyErr_SetString(PyExc_ValueError, "empty separator");
6294 return NULL;
6295 }
6296 else
6297 return split_substring(self,list,substring,maxcount);
6298}
6299
Tim Petersced69f82003-09-16 20:30:58 +00006300static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006301PyObject *rsplit(PyUnicodeObject *self,
6302 PyUnicodeObject *substring,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006303 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006304{
6305 PyObject *list;
6306
6307 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006308 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006309
6310 list = PyList_New(0);
6311 if (!list)
6312 return NULL;
6313
6314 if (substring == NULL)
6315 return rsplit_whitespace(self,list,maxcount);
6316
6317 else if (substring->length == 1)
6318 return rsplit_char(self,list,substring->str[0],maxcount);
6319
6320 else if (substring->length == 0) {
6321 Py_DECREF(list);
6322 PyErr_SetString(PyExc_ValueError, "empty separator");
6323 return NULL;
6324 }
6325 else
6326 return rsplit_substring(self,list,substring,maxcount);
6327}
6328
6329static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330PyObject *replace(PyUnicodeObject *self,
6331 PyUnicodeObject *str1,
6332 PyUnicodeObject *str2,
Martin v. Löwis18e16552006-02-15 17:27:45 +00006333 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334{
6335 PyUnicodeObject *u;
6336
6337 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006338 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339
Thomas Wouters477c8d52006-05-27 19:21:47 +00006340 if (str1->length == str2->length) {
6341 /* same length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006342 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006343 if (str1->length == 1) {
6344 /* replace characters */
6345 Py_UNICODE u1, u2;
6346 if (!findchar(self->str, self->length, str1->str[0]))
6347 goto nothing;
6348 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6349 if (!u)
6350 return NULL;
6351 Py_UNICODE_COPY(u->str, self->str, self->length);
6352 u1 = str1->str[0];
6353 u2 = str2->str[0];
6354 for (i = 0; i < u->length; i++)
6355 if (u->str[i] == u1) {
6356 if (--maxcount < 0)
6357 break;
6358 u->str[i] = u2;
6359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006361 i = fastsearch(
6362 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006364 if (i < 0)
6365 goto nothing;
6366 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6367 if (!u)
6368 return NULL;
6369 Py_UNICODE_COPY(u->str, self->str, self->length);
6370 while (i <= self->length - str1->length)
6371 if (Py_UNICODE_MATCH(self, i, str1)) {
6372 if (--maxcount < 0)
6373 break;
6374 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6375 i += str1->length;
6376 } else
6377 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006380
6381 Py_ssize_t n, i, j, e;
6382 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 Py_UNICODE *p;
6384
6385 /* replace strings */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006386 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 if (n > maxcount)
6388 n = maxcount;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006389 if (n == 0)
6390 goto nothing;
6391 /* new_size = self->length + n * (str2->length - str1->length)); */
6392 delta = (str2->length - str1->length);
6393 if (delta == 0) {
6394 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006396 product = n * (str2->length - str1->length);
6397 if ((product / (str2->length - str1->length)) != n) {
6398 PyErr_SetString(PyExc_OverflowError,
6399 "replace string is too long");
6400 return NULL;
6401 }
6402 new_size = self->length + product;
6403 if (new_size < 0) {
6404 PyErr_SetString(PyExc_OverflowError,
6405 "replace string is too long");
6406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407 }
6408 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006409 u = _PyUnicode_New(new_size);
6410 if (!u)
6411 return NULL;
6412 i = 0;
6413 p = u->str;
6414 e = self->length - str1->length;
6415 if (str1->length > 0) {
6416 while (n-- > 0) {
6417 /* look for next match */
6418 j = i;
6419 while (j <= e) {
6420 if (Py_UNICODE_MATCH(self, j, str1))
6421 break;
6422 j++;
6423 }
6424 if (j > i) {
6425 if (j > e)
6426 break;
6427 /* copy unchanged part [i:j] */
6428 Py_UNICODE_COPY(p, self->str+i, j-i);
6429 p += j - i;
6430 }
6431 /* copy substitution string */
6432 if (str2->length > 0) {
6433 Py_UNICODE_COPY(p, str2->str, str2->length);
6434 p += str2->length;
6435 }
6436 i = j + str1->length;
6437 }
6438 if (i < self->length)
6439 /* copy tail [i:] */
6440 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6441 } else {
6442 /* interleave */
6443 while (n > 0) {
6444 Py_UNICODE_COPY(p, str2->str, str2->length);
6445 p += str2->length;
6446 if (--n <= 0)
6447 break;
6448 *p++ = self->str[i++];
6449 }
6450 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006454
6455nothing:
6456 /* nothing to replace; return original string (when possible) */
6457 if (PyUnicode_CheckExact(self)) {
6458 Py_INCREF(self);
6459 return (PyObject *) self;
6460 }
6461 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462}
6463
6464/* --- Unicode Object Methods --------------------------------------------- */
6465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006466PyDoc_STRVAR(title__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006467"S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468\n\
6469Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006470characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
6472static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006473unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 return fixup(self, fixtitle);
6476}
6477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006478PyDoc_STRVAR(capitalize__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006479"S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480\n\
6481Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006482have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
6484static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006485unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 return fixup(self, fixcapitalize);
6488}
6489
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, run each
   word through fixup(..., fixcapitalize) and join the words again.
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6527
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006528/* Argument converter. Coerces to a single unicode character */
6529
6530static int
6531convert_uc(PyObject *obj, void *addr)
6532{
6533 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6534 PyObject *uniobj;
6535 Py_UNICODE *unistr;
6536
6537 uniobj = PyUnicode_FromObject(obj);
6538 if (uniobj == NULL) {
6539 PyErr_SetString(PyExc_TypeError,
6540 "The fill character cannot be converted to Unicode");
6541 return 0;
6542 }
6543 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6544 PyErr_SetString(PyExc_TypeError,
6545 "The fill character must be exactly one character long");
6546 Py_DECREF(uniobj);
6547 return 0;
6548 }
6549 unistr = PyUnicode_AS_UNICODE(uniobj);
6550 *fillcharloc = unistr[0];
6551 Py_DECREF(uniobj);
6552 return 1;
6553}
6554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006555PyDoc_STRVAR(center__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006556"S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006558Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006559done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560
6561static PyObject *
6562unicode_center(PyUnicodeObject *self, PyObject *args)
6563{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006564 Py_ssize_t marg, left;
6565 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006566 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
Thomas Woutersde017742006-02-16 19:34:37 +00006568 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 return NULL;
6570
Tim Peters7a29bd52001-09-12 03:03:31 +00006571 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 Py_INCREF(self);
6573 return (PyObject*) self;
6574 }
6575
6576 marg = width - self->length;
6577 left = marg / 2 + (marg & width & 1);
6578
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006579 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580}
6581
Marc-André Lemburge5034372000-08-08 08:04:29 +00006582#if 0
6583
6584/* This code should go into some future Unicode collation support
6585 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006586 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006587
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006588/* speedy UTF-16 code point order comparison */
6589/* gleaned from: */
6590/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6591
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006592static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006593{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006594 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006595 0, 0, 0, 0, 0, 0, 0, 0,
6596 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006597 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006598};
6599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600static int
6601unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6602{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006603 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 Py_UNICODE *s1 = str1->str;
6606 Py_UNICODE *s2 = str2->str;
6607
6608 len1 = str1->length;
6609 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006610
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006612 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006613
6614 c1 = *s1++;
6615 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006616
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006617 if (c1 > (1<<11) * 26)
6618 c1 += utf16Fixup[c1>>11];
6619 if (c2 > (1<<11) * 26)
6620 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006621 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006622
6623 if (c1 != c2)
6624 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006625
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006626 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
6628
6629 return (len1 < len2) ? -1 : (len1 != len2);
6630}
6631
Marc-André Lemburge5034372000-08-08 08:04:29 +00006632#else
6633
6634static int
6635unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6636{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006637 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006638
6639 Py_UNICODE *s1 = str1->str;
6640 Py_UNICODE *s2 = str2->str;
6641
6642 len1 = str1->length;
6643 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006644
Marc-André Lemburge5034372000-08-08 08:04:29 +00006645 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006646 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006647
Fredrik Lundh45714e92001-06-26 16:39:36 +00006648 c1 = *s1++;
6649 c2 = *s2++;
6650
6651 if (c1 != c2)
6652 return (c1 < c2) ? -1 : 1;
6653
Marc-André Lemburge5034372000-08-08 08:04:29 +00006654 len1--; len2--;
6655 }
6656
6657 return (len1 < len2) ? -1 : (len1 != len2);
6658}
6659
6660#endif
6661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662int PyUnicode_Compare(PyObject *left,
6663 PyObject *right)
6664{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006665 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6666 return unicode_compare((PyUnicodeObject *)left,
6667 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006668 PyErr_Format(PyExc_TypeError,
6669 "Can't compare %.100s and %.100s",
6670 left->ob_type->tp_name,
6671 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 return -1;
6673}
6674
Martin v. Löwis5b222132007-06-10 09:51:05 +00006675int
6676PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6677{
6678 int i;
6679 Py_UNICODE *id;
6680 assert(PyUnicode_Check(uni));
6681 id = PyUnicode_AS_UNICODE(uni);
6682 /* Compare Unicode string and source character set string */
6683 for (i = 0; id[i] && str[i]; i++)
6684 if (id[i] != str[i])
6685 return ((int)id[i] < (int)str[i]) ? -1 : 1;
6686 if (id[i])
6687 return 1; /* uni is longer */
6688 if (str[i])
6689 return -1; /* str is longer */
6690 return 0;
6691}
6692
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006693
6694#define TEST_COND(cond) \
6695 ((cond) ? Py_True : Py_False)
6696
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006697PyObject *PyUnicode_RichCompare(PyObject *left,
6698 PyObject *right,
6699 int op)
6700{
6701 int result;
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006702
6703 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6704 PyObject *v;
6705 if (((PyUnicodeObject *) left)->length !=
6706 ((PyUnicodeObject *) right)->length) {
6707 if (op == Py_EQ) {
6708 Py_INCREF(Py_False);
6709 return Py_False;
6710 }
6711 if (op == Py_NE) {
6712 Py_INCREF(Py_True);
6713 return Py_True;
6714 }
6715 }
6716 if (left == right)
6717 result = 0;
6718 else
6719 result = unicode_compare((PyUnicodeObject *)left,
6720 (PyUnicodeObject *)right);
6721
6722 /* Convert the return value to a Boolean */
6723 switch (op) {
6724 case Py_EQ:
6725 v = TEST_COND(result == 0);
6726 break;
6727 case Py_NE:
6728 v = TEST_COND(result != 0);
6729 break;
6730 case Py_LE:
6731 v = TEST_COND(result <= 0);
6732 break;
6733 case Py_GE:
6734 v = TEST_COND(result >= 0);
6735 break;
6736 case Py_LT:
6737 v = TEST_COND(result == -1);
6738 break;
6739 case Py_GT:
6740 v = TEST_COND(result == 1);
6741 break;
6742 default:
6743 PyErr_BadArgument();
6744 return NULL;
6745 }
6746 Py_INCREF(v);
6747 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006748 }
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006749
6750 Py_INCREF(Py_NotImplemented);
6751 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006752}
6753
Guido van Rossum403d68b2000-03-13 15:55:09 +00006754int PyUnicode_Contains(PyObject *container,
6755 PyObject *element)
6756{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006757 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006758 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006759
6760 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006761 sub = PyUnicode_FromObject(element);
6762 if (!sub) {
Walter Dörwald26e0f512007-06-12 16:51:31 +00006763 PyErr_Format(PyExc_TypeError,
6764 "'in <string>' requires string as left operand, not %s",
6765 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006766 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006767 }
6768
Thomas Wouters477c8d52006-05-27 19:21:47 +00006769 str = PyUnicode_FromObject(container);
6770 if (!str) {
6771 Py_DECREF(sub);
6772 return -1;
6773 }
6774
6775 result = stringlib_contains_obj(str, sub);
6776
6777 Py_DECREF(str);
6778 Py_DECREF(sub);
6779
Guido van Rossum403d68b2000-03-13 15:55:09 +00006780 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006781}
6782
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783/* Concat to string or Unicode object giving a new Unicode object. */
6784
6785PyObject *PyUnicode_Concat(PyObject *left,
6786 PyObject *right)
6787{
6788 PyUnicodeObject *u = NULL, *v = NULL, *w;
6789
6790 /* Coerce the two arguments */
6791 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6792 if (u == NULL)
6793 goto onError;
6794 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6795 if (v == NULL)
6796 goto onError;
6797
6798 /* Shortcuts */
6799 if (v == unicode_empty) {
6800 Py_DECREF(v);
6801 return (PyObject *)u;
6802 }
6803 if (u == unicode_empty) {
6804 Py_DECREF(u);
6805 return (PyObject *)v;
6806 }
6807
6808 /* Concat the two Unicode strings */
6809 w = _PyUnicode_New(u->length + v->length);
6810 if (w == NULL)
6811 goto onError;
6812 Py_UNICODE_COPY(w->str, u->str, u->length);
6813 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6814
6815 Py_DECREF(u);
6816 Py_DECREF(v);
6817 return (PyObject *)w;
6818
6819onError:
6820 Py_XDECREF(u);
6821 Py_XDECREF(v);
6822 return NULL;
6823}
6824
Walter Dörwald1ab83302007-05-18 17:15:44 +00006825void
6826PyUnicode_Append(PyObject **pleft, PyObject *right)
6827{
6828 PyObject *new;
6829 if (*pleft == NULL)
6830 return;
6831 if (right == NULL || !PyUnicode_Check(*pleft)) {
6832 Py_DECREF(*pleft);
6833 *pleft = NULL;
6834 return;
6835 }
6836 new = PyUnicode_Concat(*pleft, right);
6837 Py_DECREF(*pleft);
6838 *pleft = new;
6839}
6840
6841void
6842PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6843{
6844 PyUnicode_Append(pleft, right);
6845 Py_XDECREF(right);
6846}
6847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006848PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849"S.count(sub[, start[, end]]) -> int\n\
6850\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00006851Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006852string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006853interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
6855static PyObject *
6856unicode_count(PyUnicodeObject *self, PyObject *args)
6857{
6858 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006859 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006860 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 PyObject *result;
6862
Guido van Rossumb8872e62000-05-09 14:14:27 +00006863 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6864 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 return NULL;
6866
6867 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006868 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 if (substring == NULL)
6870 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006871
Thomas Wouters477c8d52006-05-27 19:21:47 +00006872 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
Christian Heimes217cfd12007-12-02 14:31:20 +00006874 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00006875 stringlib_count(self->str + start, end - start,
6876 substring->str, substring->length)
6877 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878
6879 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006880
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 return result;
6882}
6883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006884PyDoc_STRVAR(encode__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006885"S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00006887Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006888to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006889handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6891'xmlcharrefreplace' as well as any other name registered with\n\
6892codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
6894static PyObject *
6895unicode_encode(PyUnicodeObject *self, PyObject *args)
6896{
6897 char *encoding = NULL;
6898 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006899 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00006900
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6902 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00006903 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006904 if (v == NULL)
6905 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00006906 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006907 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00006908 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006909 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00006910 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006911 Py_DECREF(v);
6912 return NULL;
6913 }
6914 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006915
6916 onError:
6917 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006918}
6919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006920PyDoc_STRVAR(expandtabs__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00006921"S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922\n\
6923Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006924If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
6926static PyObject*
6927unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6928{
6929 Py_UNICODE *e;
6930 Py_UNICODE *p;
6931 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006932 Py_UNICODE *qe;
6933 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 PyUnicodeObject *u;
6935 int tabsize = 8;
6936
6937 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6938 return NULL;
6939
Thomas Wouters7e474022000-07-16 12:04:32 +00006940 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006941 i = 0; /* chars up to and including most recent \n or \r */
6942 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6943 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 for (p = self->str; p < e; p++)
6945 if (*p == '\t') {
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006946 if (tabsize > 0) {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006947 incr = tabsize - (j % tabsize); /* cannot overflow */
6948 if (j > PY_SSIZE_T_MAX - incr)
6949 goto overflow1;
6950 j += incr;
6951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 }
6953 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006954 if (j > PY_SSIZE_T_MAX - 1)
6955 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 j++;
6957 if (*p == '\n' || *p == '\r') {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006958 if (i > PY_SSIZE_T_MAX - j)
6959 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006961 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 }
6963 }
6964
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006965 if (i > PY_SSIZE_T_MAX - j)
6966 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00006967
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 /* Second pass: create output string and fill it */
6969 u = _PyUnicode_New(i + j);
6970 if (!u)
6971 return NULL;
6972
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006973 j = 0; /* same as in first pass */
6974 q = u->str; /* next output char */
6975 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976
6977 for (p = self->str; p < e; p++)
6978 if (*p == '\t') {
6979 if (tabsize > 0) {
6980 i = tabsize - (j % tabsize);
6981 j += i;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006982 while (i--) {
6983 if (q >= qe)
6984 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987 }
6988 }
6989 else {
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006990 if (q >= qe)
6991 goto overflow2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006993 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 if (*p == '\n' || *p == '\r')
6995 j = 0;
6996 }
6997
6998 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00006999
7000 overflow2:
7001 Py_DECREF(u);
7002 overflow1:
7003 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005}
7006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007007PyDoc_STRVAR(find__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007008"S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009\n\
7010Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007011such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012arguments start and end are interpreted as in slice notation.\n\
7013\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007014Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015
7016static PyObject *
7017unicode_find(PyUnicodeObject *self, PyObject *args)
7018{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007019 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007020 Py_ssize_t start;
7021 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007022 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023
Christian Heimes9cd17752007-11-18 19:35:23 +00007024 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026
Thomas Wouters477c8d52006-05-27 19:21:47 +00007027 result = stringlib_find_slice(
7028 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7029 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7030 start, end
7031 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032
7033 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007034
Christian Heimes217cfd12007-12-02 14:31:20 +00007035 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036}
7037
7038static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007039unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040{
7041 if (index < 0 || index >= self->length) {
7042 PyErr_SetString(PyExc_IndexError, "string index out of range");
7043 return NULL;
7044 }
7045
7046 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7047}
7048
Guido van Rossumc2504932007-09-18 19:42:40 +00007049/* Believe it or not, this produces the same value for ASCII strings
7050 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007052unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053{
Guido van Rossumc2504932007-09-18 19:42:40 +00007054 Py_ssize_t len;
7055 Py_UNICODE *p;
7056 long x;
7057
7058 if (self->hash != -1)
7059 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007060 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007061 p = self->str;
7062 x = *p << 7;
7063 while (--len >= 0)
7064 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007065 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007066 if (x == -1)
7067 x = -2;
7068 self->hash = x;
7069 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070}
7071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007072PyDoc_STRVAR(index__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007073"S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007075Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076
7077static PyObject *
7078unicode_index(PyUnicodeObject *self, PyObject *args)
7079{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007080 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007081 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007082 Py_ssize_t start;
7083 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084
Christian Heimes9cd17752007-11-18 19:35:23 +00007085 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087
Thomas Wouters477c8d52006-05-27 19:21:47 +00007088 result = stringlib_find_slice(
7089 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7090 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7091 start, end
7092 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093
7094 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007095
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 if (result < 0) {
7097 PyErr_SetString(PyExc_ValueError, "substring not found");
7098 return NULL;
7099 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007100
Christian Heimes217cfd12007-12-02 14:31:20 +00007101 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102}
7103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007105"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007107Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007108at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109
7110static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007111unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112{
7113 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7114 register const Py_UNICODE *e;
7115 int cased;
7116
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 /* Shortcut for single character strings */
7118 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007119 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007121 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007122 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007123 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007124
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 e = p + PyUnicode_GET_SIZE(self);
7126 cased = 0;
7127 for (; p < e; p++) {
7128 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007129
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007131 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 else if (!cased && Py_UNICODE_ISLOWER(ch))
7133 cased = 1;
7134 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007135 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136}
7137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007138PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007139"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007141Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007142at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143
7144static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007145unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146{
7147 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7148 register const Py_UNICODE *e;
7149 int cased;
7150
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151 /* Shortcut for single character strings */
7152 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007153 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007155 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007156 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007157 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007158
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 e = p + PyUnicode_GET_SIZE(self);
7160 cased = 0;
7161 for (; p < e; p++) {
7162 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007163
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007165 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 else if (!cased && Py_UNICODE_ISUPPER(ch))
7167 cased = 1;
7168 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007169 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170}
7171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007172PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007173"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007175Return True if S is a titlecased string and there is at least one\n\
7176character in S, i.e. upper- and titlecase characters may only\n\
7177follow uncased characters and lowercase characters only cased ones.\n\
7178Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179
7180static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007181unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182{
7183 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7184 register const Py_UNICODE *e;
7185 int cased, previous_is_cased;
7186
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 /* Shortcut for single character strings */
7188 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007189 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7190 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007192 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007193 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007194 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007195
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196 e = p + PyUnicode_GET_SIZE(self);
7197 cased = 0;
7198 previous_is_cased = 0;
7199 for (; p < e; p++) {
7200 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007201
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7203 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007204 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 previous_is_cased = 1;
7206 cased = 1;
7207 }
7208 else if (Py_UNICODE_ISLOWER(ch)) {
7209 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007210 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 previous_is_cased = 1;
7212 cased = 1;
7213 }
7214 else
7215 previous_is_cased = 0;
7216 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007217 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218}
7219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007220PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007221"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007223Return True if all characters in S are whitespace\n\
7224and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225
7226static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007227unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228{
7229 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7230 register const Py_UNICODE *e;
7231
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 /* Shortcut for single character strings */
7233 if (PyUnicode_GET_SIZE(self) == 1 &&
7234 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007235 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007237 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007238 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007239 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007240
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 e = p + PyUnicode_GET_SIZE(self);
7242 for (; p < e; p++) {
7243 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007244 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007246 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247}
7248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007249PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007250"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007251\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007252Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007253and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007254
7255static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007256unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007257{
7258 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7259 register const Py_UNICODE *e;
7260
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007261 /* Shortcut for single character strings */
7262 if (PyUnicode_GET_SIZE(self) == 1 &&
7263 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007264 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007265
7266 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007267 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007268 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007269
7270 e = p + PyUnicode_GET_SIZE(self);
7271 for (; p < e; p++) {
7272 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007273 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007274 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007275 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007276}
7277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007278PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007279"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007280\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007281Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007282and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007283
7284static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007285unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007286{
7287 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7288 register const Py_UNICODE *e;
7289
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007290 /* Shortcut for single character strings */
7291 if (PyUnicode_GET_SIZE(self) == 1 &&
7292 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007293 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007294
7295 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007296 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007297 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007298
7299 e = p + PyUnicode_GET_SIZE(self);
7300 for (; p < e; p++) {
7301 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007302 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007303 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007304 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007305}
7306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007307PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007308"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007310Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007311False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
7313static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007314unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315{
7316 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7317 register const Py_UNICODE *e;
7318
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319 /* Shortcut for single character strings */
7320 if (PyUnicode_GET_SIZE(self) == 1 &&
7321 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007322 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007324 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007325 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007326 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007327
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 e = p + PyUnicode_GET_SIZE(self);
7329 for (; p < e; p++) {
7330 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007331 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007333 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334}
7335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007336PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007337"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007339Return True if all characters in S are digits\n\
7340and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341
7342static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007343unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344{
7345 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7346 register const Py_UNICODE *e;
7347
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 /* Shortcut for single character strings */
7349 if (PyUnicode_GET_SIZE(self) == 1 &&
7350 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007351 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007353 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007354 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007355 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007356
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 e = p + PyUnicode_GET_SIZE(self);
7358 for (; p < e; p++) {
7359 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007360 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007362 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363}
7364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007365PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00007366"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007368Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007369False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370
7371static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007372unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373{
7374 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7375 register const Py_UNICODE *e;
7376
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377 /* Shortcut for single character strings */
7378 if (PyUnicode_GET_SIZE(self) == 1 &&
7379 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007380 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007382 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007383 if (PyUnicode_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00007384 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007385
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 e = p + PyUnicode_GET_SIZE(self);
7387 for (; p < e; p++) {
7388 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00007389 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007391 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392}
7393
Martin v. Löwis47383402007-08-15 07:32:56 +00007394int
7395PyUnicode_IsIdentifier(PyObject *self)
7396{
7397 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7398 register const Py_UNICODE *e;
7399
7400 /* Special case for empty strings */
7401 if (PyUnicode_GET_SIZE(self) == 0)
7402 return 0;
7403
7404 /* PEP 3131 says that the first character must be in
7405 XID_Start and subsequent characters in XID_Continue,
7406 and for the ASCII range, the 2.x rules apply (i.e
7407 start with letters and underscore, continue with
7408 letters, digits, underscore). However, given the current
7409 definition of XID_Start and XID_Continue, it is sufficient
7410 to check just for these, except that _ must be allowed
7411 as starting an identifier. */
7412 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7413 return 0;
7414
7415 e = p + PyUnicode_GET_SIZE(self);
7416 for (p++; p < e; p++) {
7417 if (!_PyUnicode_IsXidContinue(*p))
7418 return 0;
7419 }
7420 return 1;
7421}
7422
7423PyDoc_STRVAR(isidentifier__doc__,
7424"S.isidentifier() -> bool\n\
7425\n\
7426Return True if S is a valid identifier according\n\
7427to the language definition.");
7428
7429static PyObject*
7430unicode_isidentifier(PyObject *self)
7431{
7432 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7433}
7434
Georg Brandl559e5d72008-06-11 18:37:52 +00007435PyDoc_STRVAR(isprintable__doc__,
7436"S.isprintable() -> bool\n\
7437\n\
7438Return True if all characters in S are considered\n\
7439printable in repr() or S is empty, False otherwise.");
7440
7441static PyObject*
7442unicode_isprintable(PyObject *self)
7443{
7444 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7445 register const Py_UNICODE *e;
7446
7447 /* Shortcut for single character strings */
7448 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7449 Py_RETURN_TRUE;
7450 }
7451
7452 e = p + PyUnicode_GET_SIZE(self);
7453 for (; p < e; p++) {
7454 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7455 Py_RETURN_FALSE;
7456 }
7457 }
7458 Py_RETURN_TRUE;
7459}
7460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007461PyDoc_STRVAR(join__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007462"S.join(sequence) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463\n\
7464Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007465sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466
7467static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007468unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007470 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471}
7472
Martin v. Löwis18e16552006-02-15 17:27:45 +00007473static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474unicode_length(PyUnicodeObject *self)
7475{
7476 return self->length;
7477}
7478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007479PyDoc_STRVAR(ljust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007480"S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007482Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007483done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484
7485static PyObject *
7486unicode_ljust(PyUnicodeObject *self, PyObject *args)
7487{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007488 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007489 Py_UNICODE fillchar = ' ';
7490
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007491 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 return NULL;
7493
Tim Peters7a29bd52001-09-12 03:03:31 +00007494 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 Py_INCREF(self);
7496 return (PyObject*) self;
7497 }
7498
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007499 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500}
7501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007502PyDoc_STRVAR(lower__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007503"S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007505Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506
7507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007508unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 return fixup(self, fixlower);
7511}
7512
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: its format string with the three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7521
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007522/* externally visible for str.strip(unicode) */
7523PyObject *
7524_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7525{
7526 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007527 Py_ssize_t len = PyUnicode_GET_SIZE(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007528 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007529 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7530 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007531
Thomas Wouters477c8d52006-05-27 19:21:47 +00007532 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7533
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007534 i = 0;
7535 if (striptype != RIGHTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007536 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7537 i++;
7538 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007539 }
7540
7541 j = len;
7542 if (striptype != LEFTSTRIP) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007543 do {
7544 j--;
7545 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7546 j++;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007547 }
7548
7549 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007550 Py_INCREF(self);
7551 return (PyObject*)self;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007552 }
7553 else
Thomas Wouters477c8d52006-05-27 19:21:47 +00007554 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007555}
7556
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557
7558static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007559do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007561 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
Martin v. Löwis18e16552006-02-15 17:27:45 +00007562 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007563
7564 i = 0;
7565 if (striptype != RIGHTSTRIP) {
7566 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7567 i++;
7568 }
7569 }
7570
7571 j = len;
7572 if (striptype != LEFTSTRIP) {
7573 do {
7574 j--;
7575 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7576 j++;
7577 }
7578
7579 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7580 Py_INCREF(self);
7581 return (PyObject*)self;
7582 }
7583 else
7584 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585}
7586
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007587
7588static PyObject *
7589do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7590{
7591 PyObject *sep = NULL;
7592
7593 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7594 return NULL;
7595
7596 if (sep != NULL && sep != Py_None) {
7597 if (PyUnicode_Check(sep))
7598 return _PyUnicode_XStrip(self, striptype, sep);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007599 else {
7600 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00007601 "%s arg must be None or str",
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007602 STRIPNAME(striptype));
7603 return NULL;
7604 }
7605 }
7606
7607 return do_strip(self, striptype);
7608}
7609
7610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007611PyDoc_STRVAR(strip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007612"S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007613\n\
7614Return a copy of the string S with leading and trailing\n\
7615whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007616If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007617
7618static PyObject *
7619unicode_strip(PyUnicodeObject *self, PyObject *args)
7620{
7621 if (PyTuple_GET_SIZE(args) == 0)
7622 return do_strip(self, BOTHSTRIP); /* Common case */
7623 else
7624 return do_argstrip(self, BOTHSTRIP, args);
7625}
7626
7627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007628PyDoc_STRVAR(lstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007629"S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007630\n\
7631Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007632If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007633
7634static PyObject *
7635unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7636{
7637 if (PyTuple_GET_SIZE(args) == 0)
7638 return do_strip(self, LEFTSTRIP); /* Common case */
7639 else
7640 return do_argstrip(self, LEFTSTRIP, args);
7641}
7642
7643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007644PyDoc_STRVAR(rstrip__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007645"S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007646\n\
7647Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007648If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007649
7650static PyObject *
7651unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7652{
7653 if (PyTuple_GET_SIZE(args) == 0)
7654 return do_strip(self, RIGHTSTRIP); /* Common case */
7655 else
7656 return do_argstrip(self, RIGHTSTRIP, args);
7657}
7658
7659
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007661unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662{
7663 PyUnicodeObject *u;
7664 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007665 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007666 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
7668 if (len < 0)
7669 len = 0;
7670
Tim Peters7a29bd52001-09-12 03:03:31 +00007671 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672 /* no repeat, return original string */
7673 Py_INCREF(str);
7674 return (PyObject*) str;
7675 }
Tim Peters8f422462000-09-09 06:13:41 +00007676
7677 /* ensure # of chars needed doesn't overflow int and # of bytes
7678 * needed doesn't overflow size_t
7679 */
7680 nchars = len * str->length;
7681 if (len && nchars / len != str->length) {
7682 PyErr_SetString(PyExc_OverflowError,
7683 "repeated string is too long");
7684 return NULL;
7685 }
7686 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7687 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7688 PyErr_SetString(PyExc_OverflowError,
7689 "repeated string is too long");
7690 return NULL;
7691 }
7692 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 if (!u)
7694 return NULL;
7695
7696 p = u->str;
7697
Thomas Wouters477c8d52006-05-27 19:21:47 +00007698 if (str->length == 1 && len > 0) {
7699 Py_UNICODE_FILL(p, str->str[0], len);
7700 } else {
7701 Py_ssize_t done = 0; /* number of characters copied this far */
7702 if (done < nchars) {
7703 Py_UNICODE_COPY(p, str->str, str->length);
7704 done = str->length;
7705 }
7706 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007707 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007708 Py_UNICODE_COPY(p+done, p, n);
7709 done += n;
7710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 }
7712
7713 return (PyObject*) u;
7714}
7715
7716PyObject *PyUnicode_Replace(PyObject *obj,
7717 PyObject *subobj,
7718 PyObject *replobj,
Martin v. Löwis18e16552006-02-15 17:27:45 +00007719 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720{
7721 PyObject *self;
7722 PyObject *str1;
7723 PyObject *str2;
7724 PyObject *result;
7725
7726 self = PyUnicode_FromObject(obj);
7727 if (self == NULL)
7728 return NULL;
7729 str1 = PyUnicode_FromObject(subobj);
7730 if (str1 == NULL) {
7731 Py_DECREF(self);
7732 return NULL;
7733 }
7734 str2 = PyUnicode_FromObject(replobj);
7735 if (str2 == NULL) {
7736 Py_DECREF(self);
7737 Py_DECREF(str1);
7738 return NULL;
7739 }
Tim Petersced69f82003-09-16 20:30:58 +00007740 result = replace((PyUnicodeObject *)self,
7741 (PyUnicodeObject *)str1,
7742 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 maxcount);
7744 Py_DECREF(self);
7745 Py_DECREF(str1);
7746 Py_DECREF(str2);
7747 return result;
7748}
7749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007750PyDoc_STRVAR(replace__doc__,
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007751"S.replace (old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752\n\
7753Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007754old replaced by new. If the optional argument count is\n\
7755given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756
7757static PyObject*
7758unicode_replace(PyUnicodeObject *self, PyObject *args)
7759{
7760 PyUnicodeObject *str1;
7761 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007762 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 PyObject *result;
7764
Martin v. Löwis18e16552006-02-15 17:27:45 +00007765 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 return NULL;
7767 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7768 if (str1 == NULL)
7769 return NULL;
7770 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007771 if (str2 == NULL) {
7772 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775
7776 result = replace(self, str1, str2, maxcount);
7777
7778 Py_DECREF(str1);
7779 Py_DECREF(str2);
7780 return result;
7781}
7782
7783static
7784PyObject *unicode_repr(PyObject *unicode)
7785{
Walter Dörwald79e913e2007-05-12 11:08:06 +00007786 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007787 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007788 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7789 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7790
7791 /* XXX(nnorwitz): rather than over-allocating, it would be
7792 better to choose a different scheme. Perhaps scan the
7793 first N-chars of the string and allocate based on that size.
7794 */
7795 /* Initial allocation is based on the longest-possible unichr
7796 escape.
7797
7798 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7799 unichr, so in this case it's the longest unichr escape. In
7800 narrow (UTF-16) builds this is five chars per source unichr
7801 since there are two unichrs in the surrogate pair, so in narrow
7802 (UTF-16) builds it's not the longest unichr escape.
7803
7804 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7805 so in the narrow (UTF-16) build case it's the longest unichr
7806 escape.
7807 */
7808
Walter Dörwald1ab83302007-05-18 17:15:44 +00007809 repr = PyUnicode_FromUnicode(NULL,
Walter Dörwald79e913e2007-05-12 11:08:06 +00007810 2 /* quotes */
7811#ifdef Py_UNICODE_WIDE
7812 + 10*size
7813#else
7814 + 6*size
7815#endif
7816 + 1);
7817 if (repr == NULL)
7818 return NULL;
7819
Walter Dörwald1ab83302007-05-18 17:15:44 +00007820 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00007821
7822 /* Add quote */
7823 *p++ = (findchar(s, size, '\'') &&
7824 !findchar(s, size, '"')) ? '"' : '\'';
7825 while (size-- > 0) {
7826 Py_UNICODE ch = *s++;
7827
7828 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007829 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007830 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00007831 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00007832 continue;
7833 }
7834
Georg Brandl559e5d72008-06-11 18:37:52 +00007835 /* Map special whitespace to '\t', \n', '\r' */
7836 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007837 *p++ = '\\';
7838 *p++ = 't';
7839 }
7840 else if (ch == '\n') {
7841 *p++ = '\\';
7842 *p++ = 'n';
7843 }
7844 else if (ch == '\r') {
7845 *p++ = '\\';
7846 *p++ = 'r';
7847 }
7848
7849 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00007850 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00007851 *p++ = '\\';
7852 *p++ = 'x';
7853 *p++ = hexdigits[(ch >> 4) & 0x000F];
7854 *p++ = hexdigits[ch & 0x000F];
7855 }
7856
Georg Brandl559e5d72008-06-11 18:37:52 +00007857 /* Copy ASCII characters as-is */
7858 else if (ch < 0x7F) {
7859 *p++ = ch;
7860 }
7861
7862 /* Non-ASCII characters */
7863 else {
7864 Py_UCS4 ucs = ch;
7865
7866#ifndef Py_UNICODE_WIDE
7867 Py_UNICODE ch2 = 0;
7868 /* Get code point from surrogate pair */
7869 if (size > 0) {
7870 ch2 = *s;
7871 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7872 && ch2 <= 0xDFFF) {
7873 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7874 + 0x00010000;
7875 s++;
7876 size--;
7877 }
7878 }
7879#endif
7880 /* Map Unicode whitespace and control characters
7881 (categories Z* and C* except ASCII space)
7882 */
7883 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7884 /* Map 8-bit characters to '\xhh' */
7885 if (ucs <= 0xff) {
7886 *p++ = '\\';
7887 *p++ = 'x';
7888 *p++ = hexdigits[(ch >> 4) & 0x000F];
7889 *p++ = hexdigits[ch & 0x000F];
7890 }
7891 /* Map 21-bit characters to '\U00xxxxxx' */
7892 else if (ucs >= 0x10000) {
7893 *p++ = '\\';
7894 *p++ = 'U';
7895 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7896 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7897 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7898 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7899 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7900 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7901 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7902 *p++ = hexdigits[ucs & 0x0000000F];
7903 }
7904 /* Map 16-bit characters to '\uxxxx' */
7905 else {
7906 *p++ = '\\';
7907 *p++ = 'u';
7908 *p++ = hexdigits[(ucs >> 12) & 0x000F];
7909 *p++ = hexdigits[(ucs >> 8) & 0x000F];
7910 *p++ = hexdigits[(ucs >> 4) & 0x000F];
7911 *p++ = hexdigits[ucs & 0x000F];
7912 }
7913 }
7914 /* Copy characters as-is */
7915 else {
7916 *p++ = ch;
7917#ifndef Py_UNICODE_WIDE
7918 if (ucs >= 0x10000)
7919 *p++ = ch2;
7920#endif
7921 }
7922 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00007923 }
7924 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00007925 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00007926
7927 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00007928 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00007929 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930}
7931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007932PyDoc_STRVAR(rfind__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007933"S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007934\n\
7935Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007936such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937arguments start and end are interpreted as in slice notation.\n\
7938\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007939Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940
7941static PyObject *
7942unicode_rfind(PyUnicodeObject *self, PyObject *args)
7943{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007944 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007945 Py_ssize_t start;
7946 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007947 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948
Christian Heimes9cd17752007-11-18 19:35:23 +00007949 if (!_ParseTupleFinds(args, &substring, &start, &end))
7950 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951
Thomas Wouters477c8d52006-05-27 19:21:47 +00007952 result = stringlib_rfind_slice(
7953 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7954 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7955 start, end
7956 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957
7958 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007959
Christian Heimes217cfd12007-12-02 14:31:20 +00007960 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961}
7962
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007963PyDoc_STRVAR(rindex__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007964"S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007966Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967
7968static PyObject *
7969unicode_rindex(PyUnicodeObject *self, PyObject *args)
7970{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007971 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007972 Py_ssize_t start;
7973 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007974 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975
Christian Heimes9cd17752007-11-18 19:35:23 +00007976 if (!_ParseTupleFinds(args, &substring, &start, &end))
7977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978
Thomas Wouters477c8d52006-05-27 19:21:47 +00007979 result = stringlib_rfind_slice(
7980 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7981 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7982 start, end
7983 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984
7985 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007986
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 if (result < 0) {
7988 PyErr_SetString(PyExc_ValueError, "substring not found");
7989 return NULL;
7990 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007991 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992}
7993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007994PyDoc_STRVAR(rjust__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00007995"S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007997Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007998done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999
8000static PyObject *
8001unicode_rjust(PyUnicodeObject *self, PyObject *args)
8002{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008003 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008004 Py_UNICODE fillchar = ' ';
8005
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008006 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 return NULL;
8008
Tim Peters7a29bd52001-09-12 03:03:31 +00008009 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 Py_INCREF(self);
8011 return (PyObject*) self;
8012 }
8013
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008014 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015}
8016
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017PyObject *PyUnicode_Split(PyObject *s,
8018 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008019 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020{
8021 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008022
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 s = PyUnicode_FromObject(s);
8024 if (s == NULL)
8025 return NULL;
8026 if (sep != NULL) {
8027 sep = PyUnicode_FromObject(sep);
8028 if (sep == NULL) {
8029 Py_DECREF(s);
8030 return NULL;
8031 }
8032 }
8033
8034 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8035
8036 Py_DECREF(s);
8037 Py_XDECREF(sep);
8038 return result;
8039}
8040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008041PyDoc_STRVAR(split__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008042"S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043\n\
8044Return a list of the words in S, using sep as the\n\
8045delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008046splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008047whitespace string is a separator and empty strings are\n\
8048removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049
8050static PyObject*
8051unicode_split(PyUnicodeObject *self, PyObject *args)
8052{
8053 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008054 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055
Martin v. Löwis18e16552006-02-15 17:27:45 +00008056 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 return NULL;
8058
8059 if (substring == Py_None)
8060 return split(self, NULL, maxcount);
8061 else if (PyUnicode_Check(substring))
8062 return split(self, (PyUnicodeObject *)substring, maxcount);
8063 else
8064 return PyUnicode_Split((PyObject *)self, substring, maxcount);
8065}
8066
Thomas Wouters477c8d52006-05-27 19:21:47 +00008067PyObject *
8068PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8069{
8070 PyObject* str_obj;
8071 PyObject* sep_obj;
8072 PyObject* out;
8073
8074 str_obj = PyUnicode_FromObject(str_in);
8075 if (!str_obj)
8076 return NULL;
8077 sep_obj = PyUnicode_FromObject(sep_in);
8078 if (!sep_obj) {
8079 Py_DECREF(str_obj);
8080 return NULL;
8081 }
8082
8083 out = stringlib_partition(
8084 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8085 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8086 );
8087
8088 Py_DECREF(sep_obj);
8089 Py_DECREF(str_obj);
8090
8091 return out;
8092}
8093
8094
8095PyObject *
8096PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8097{
8098 PyObject* str_obj;
8099 PyObject* sep_obj;
8100 PyObject* out;
8101
8102 str_obj = PyUnicode_FromObject(str_in);
8103 if (!str_obj)
8104 return NULL;
8105 sep_obj = PyUnicode_FromObject(sep_in);
8106 if (!sep_obj) {
8107 Py_DECREF(str_obj);
8108 return NULL;
8109 }
8110
8111 out = stringlib_rpartition(
8112 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8113 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8114 );
8115
8116 Py_DECREF(sep_obj);
8117 Py_DECREF(str_obj);
8118
8119 return out;
8120}
8121
8122PyDoc_STRVAR(partition__doc__,
8123"S.partition(sep) -> (head, sep, tail)\n\
8124\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008125Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008126the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008127found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008128
8129static PyObject*
8130unicode_partition(PyUnicodeObject *self, PyObject *separator)
8131{
8132 return PyUnicode_Partition((PyObject *)self, separator);
8133}
8134
8135PyDoc_STRVAR(rpartition__doc__,
Thomas Wouters89f507f2006-12-13 04:49:30 +00008136"S.rpartition(sep) -> (tail, sep, head)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008137\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008138Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008139the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008140separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008141
8142static PyObject*
8143unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8144{
8145 return PyUnicode_RPartition((PyObject *)self, separator);
8146}
8147
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008148PyObject *PyUnicode_RSplit(PyObject *s,
8149 PyObject *sep,
Martin v. Löwis18e16552006-02-15 17:27:45 +00008150 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008151{
8152 PyObject *result;
8153
8154 s = PyUnicode_FromObject(s);
8155 if (s == NULL)
8156 return NULL;
8157 if (sep != NULL) {
8158 sep = PyUnicode_FromObject(sep);
8159 if (sep == NULL) {
8160 Py_DECREF(s);
8161 return NULL;
8162 }
8163 }
8164
8165 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8166
8167 Py_DECREF(s);
8168 Py_XDECREF(sep);
8169 return result;
8170}
8171
8172PyDoc_STRVAR(rsplit__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008173"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008174\n\
8175Return a list of the words in S, using sep as the\n\
8176delimiter string, starting at the end of the string and\n\
8177working to the front. If maxsplit is given, at most maxsplit\n\
8178splits are done. If sep is not specified, any whitespace string\n\
8179is a separator.");
8180
8181static PyObject*
8182unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8183{
8184 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008185 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008186
Martin v. Löwis18e16552006-02-15 17:27:45 +00008187 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008188 return NULL;
8189
8190 if (substring == Py_None)
8191 return rsplit(self, NULL, maxcount);
8192 else if (PyUnicode_Check(substring))
8193 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8194 else
8195 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8196}
8197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008198PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson4469d0c2008-11-30 22:46:23 +00008199"S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200\n\
8201Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008202Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008203is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204
8205static PyObject*
8206unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8207{
Guido van Rossum86662912000-04-11 15:38:46 +00008208 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209
Guido van Rossum86662912000-04-11 15:38:46 +00008210 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 return NULL;
8212
Guido van Rossum86662912000-04-11 15:38:46 +00008213 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214}
8215
8216static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008217PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218{
Walter Dörwald346737f2007-05-31 10:44:43 +00008219 if (PyUnicode_CheckExact(self)) {
8220 Py_INCREF(self);
8221 return self;
8222 } else
8223 /* Subtype -- return genuine unicode string with the same value. */
8224 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8225 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226}
8227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008228PyDoc_STRVAR(swapcase__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008229"S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230\n\
8231Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008232and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233
8234static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008235unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 return fixup(self, fixswapcase);
8238}
8239
Georg Brandlceee0772007-11-27 23:48:05 +00008240PyDoc_STRVAR(maketrans__doc__,
8241"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8242\n\
8243Return a translation table usable for str.translate().\n\
8244If there is only one argument, it must be a dictionary mapping Unicode\n\
8245ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008246Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008247If there are two arguments, they must be strings of equal length, and\n\
8248in the resulting dictionary, each character in x will be mapped to the\n\
8249character at the same position in y. If there is a third argument, it\n\
8250must be a string, whose characters will be mapped to None in the result.");
8251
8252static PyObject*
8253unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8254{
8255 PyObject *x, *y = NULL, *z = NULL;
8256 PyObject *new = NULL, *key, *value;
8257 Py_ssize_t i = 0;
8258 int res;
8259
8260 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8261 return NULL;
8262 new = PyDict_New();
8263 if (!new)
8264 return NULL;
8265 if (y != NULL) {
8266 /* x must be a string too, of equal length */
8267 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8268 if (!PyUnicode_Check(x)) {
8269 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8270 "be a string if there is a second argument");
8271 goto err;
8272 }
8273 if (PyUnicode_GET_SIZE(x) != ylen) {
8274 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8275 "arguments must have equal length");
8276 goto err;
8277 }
8278 /* create entries for translating chars in x to those in y */
8279 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008280 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8281 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008282 if (!key || !value)
8283 goto err;
8284 res = PyDict_SetItem(new, key, value);
8285 Py_DECREF(key);
8286 Py_DECREF(value);
8287 if (res < 0)
8288 goto err;
8289 }
8290 /* create entries for deleting chars in z */
8291 if (z != NULL) {
8292 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008293 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008294 if (!key)
8295 goto err;
8296 res = PyDict_SetItem(new, key, Py_None);
8297 Py_DECREF(key);
8298 if (res < 0)
8299 goto err;
8300 }
8301 }
8302 } else {
8303 /* x must be a dict */
8304 if (!PyDict_Check(x)) {
8305 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8306 "to maketrans it must be a dict");
8307 goto err;
8308 }
8309 /* copy entries into the new dict, converting string keys to int keys */
8310 while (PyDict_Next(x, &i, &key, &value)) {
8311 if (PyUnicode_Check(key)) {
8312 /* convert string keys to integer keys */
8313 PyObject *newkey;
8314 if (PyUnicode_GET_SIZE(key) != 1) {
8315 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8316 "table must be of length 1");
8317 goto err;
8318 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008319 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008320 if (!newkey)
8321 goto err;
8322 res = PyDict_SetItem(new, newkey, value);
8323 Py_DECREF(newkey);
8324 if (res < 0)
8325 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008326 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008327 /* just keep integer keys */
8328 if (PyDict_SetItem(new, key, value) < 0)
8329 goto err;
8330 } else {
8331 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8332 "be strings or integers");
8333 goto err;
8334 }
8335 }
8336 }
8337 return new;
8338 err:
8339 Py_DECREF(new);
8340 return NULL;
8341}
8342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008343PyDoc_STRVAR(translate__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008344"S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345\n\
8346Return a copy of the string S, where all characters have been mapped\n\
8347through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008348Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008349Unmapped characters are left untouched. Characters mapped to None\n\
8350are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351
8352static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008353unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354{
Georg Brandlceee0772007-11-27 23:48:05 +00008355 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356}
8357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008358PyDoc_STRVAR(upper__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008359"S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008361Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362
8363static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008364unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 return fixup(self, fixupper);
8367}
8368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008369PyDoc_STRVAR(zfill__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008370"S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008372Pad a numeric string S with zeros on the left, to fill a field\n\
8373of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374
8375static PyObject *
8376unicode_zfill(PyUnicodeObject *self, PyObject *args)
8377{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008378 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 PyUnicodeObject *u;
8380
Martin v. Löwis18e16552006-02-15 17:27:45 +00008381 Py_ssize_t width;
8382 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 return NULL;
8384
8385 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008386 if (PyUnicode_CheckExact(self)) {
8387 Py_INCREF(self);
8388 return (PyObject*) self;
8389 }
8390 else
8391 return PyUnicode_FromUnicode(
8392 PyUnicode_AS_UNICODE(self),
8393 PyUnicode_GET_SIZE(self)
8394 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 }
8396
8397 fill = width - self->length;
8398
8399 u = pad(self, fill, 0, '0');
8400
Walter Dörwald068325e2002-04-15 13:36:47 +00008401 if (u == NULL)
8402 return NULL;
8403
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 if (u->str[fill] == '+' || u->str[fill] == '-') {
8405 /* move sign to beginning of string */
8406 u->str[0] = u->str[fill];
8407 u->str[fill] = '0';
8408 }
8409
8410 return (PyObject*) u;
8411}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412
#if 0
/* Compiled out.  Debugging aid that reports the current value of the
   file-level numfree counter as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long free_count = numfree;

    return PyLong_FromLong(free_count);
}
#endif
8420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008421PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008422"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008424Return True if S starts with the specified prefix, False otherwise.\n\
8425With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008426With optional end, stop comparing S at that position.\n\
8427prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428
8429static PyObject *
8430unicode_startswith(PyUnicodeObject *self,
8431 PyObject *args)
8432{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008433 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008435 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008436 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008437 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008439 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Guido van Rossumb8872e62000-05-09 14:14:27 +00008440 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008442 if (PyTuple_Check(subobj)) {
8443 Py_ssize_t i;
8444 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8445 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8446 PyTuple_GET_ITEM(subobj, i));
8447 if (substring == NULL)
8448 return NULL;
8449 result = tailmatch(self, substring, start, end, -1);
8450 Py_DECREF(substring);
8451 if (result) {
8452 Py_RETURN_TRUE;
8453 }
8454 }
8455 /* nothing matched */
8456 Py_RETURN_FALSE;
8457 }
8458 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008460 return NULL;
8461 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008463 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464}
8465
8466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008467PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00008468"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008470Return True if S ends with the specified suffix, False otherwise.\n\
8471With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008472With optional end, stop comparing S at that position.\n\
8473suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474
8475static PyObject *
8476unicode_endswith(PyUnicodeObject *self,
8477 PyObject *args)
8478{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008479 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008481 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008482 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008483 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008485 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8486 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008488 if (PyTuple_Check(subobj)) {
8489 Py_ssize_t i;
8490 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8491 substring = (PyUnicodeObject *)PyUnicode_FromObject(
8492 PyTuple_GET_ITEM(subobj, i));
8493 if (substring == NULL)
8494 return NULL;
8495 result = tailmatch(self, substring, start, end, +1);
8496 Py_DECREF(substring);
8497 if (result) {
8498 Py_RETURN_TRUE;
8499 }
8500 }
8501 Py_RETURN_FALSE;
8502 }
8503 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 if (substring == NULL)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008507 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008509 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510}
8511
Eric Smith8c663262007-08-25 02:26:07 +00008512#include "stringlib/string_format.h"
8513
8514PyDoc_STRVAR(format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008515"S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008516\n\
8517");
8518
Eric Smith4a7d76d2008-05-30 18:10:19 +00008519static PyObject *
8520unicode__format__(PyObject* self, PyObject* args)
8521{
8522 PyObject *format_spec;
8523
8524 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8525 return NULL;
8526
8527 return _PyUnicode_FormatAdvanced(self,
8528 PyUnicode_AS_UNICODE(format_spec),
8529 PyUnicode_GET_SIZE(format_spec));
8530}
8531
Eric Smith8c663262007-08-25 02:26:07 +00008532PyDoc_STRVAR(p_format__doc__,
Georg Brandl17cb8a82008-05-30 08:20:09 +00008533"S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008534\n\
8535");
8536
8537static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008538unicode__sizeof__(PyUnicodeObject *v)
8539{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008540 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8541 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008542}
8543
8544PyDoc_STRVAR(sizeof__doc__,
8545"S.__sizeof__() -> size of S in memory, in bytes");
8546
8547static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008548unicode_getnewargs(PyUnicodeObject *v)
8549{
8550 return Py_BuildValue("(u#)", v->str, v->length);
8551}
8552
8553
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554static PyMethodDef unicode_methods[] = {
8555
8556 /* Order is according to common usage: often used methods should
8557 appear first, since lookup is done sequentially. */
8558
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008559 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8560 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8561 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008562 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008563 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8564 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8565 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8566 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8567 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8568 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8569 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008570 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008571 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8572 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8573 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008574 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008575 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8576 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8577 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008578 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008579 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008580 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008581 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008582 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8583 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8584 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8585 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8586 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8587 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8588 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8589 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8590 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8591 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8592 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8593 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8594 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8595 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008596 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008597 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008598 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008599 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008600 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008601 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8602 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008603 {"maketrans", (PyCFunction) unicode_maketrans,
8604 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008605 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008606#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008607 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608#endif
8609
8610#if 0
8611 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008612 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613#endif
8614
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008615 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 {NULL, NULL}
8617};
8618
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008619static PyObject *
8620unicode_mod(PyObject *v, PyObject *w)
8621{
8622 if (!PyUnicode_Check(v)) {
8623 Py_INCREF(Py_NotImplemented);
8624 return Py_NotImplemented;
8625 }
8626 return PyUnicode_Format(v, w);
8627}
8628
8629static PyNumberMethods unicode_as_number = {
8630 0, /*nb_add*/
8631 0, /*nb_subtract*/
8632 0, /*nb_multiply*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008633 unicode_mod, /*nb_remainder*/
8634};
8635
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636static PySequenceMethods unicode_as_sequence = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008637 (lenfunc) unicode_length, /* sq_length */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008638 PyUnicode_Concat, /* sq_concat */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008639 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8640 (ssizeargfunc) unicode_getitem, /* sq_item */
Thomas Woutersd2cf20e2007-08-30 22:57:53 +00008641 0, /* sq_slice */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 0, /* sq_ass_item */
8643 0, /* sq_ass_slice */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008644 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645};
8646
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008647static PyObject*
8648unicode_subscript(PyUnicodeObject* self, PyObject* item)
8649{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008650 if (PyIndex_Check(item)) {
8651 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008652 if (i == -1 && PyErr_Occurred())
8653 return NULL;
8654 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008655 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008656 return unicode_getitem(self, i);
8657 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008658 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008659 Py_UNICODE* source_buf;
8660 Py_UNICODE* result_buf;
8661 PyObject* result;
8662
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008663 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008664 &start, &stop, &step, &slicelength) < 0) {
8665 return NULL;
8666 }
8667
8668 if (slicelength <= 0) {
8669 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008670 } else if (start == 0 && step == 1 && slicelength == self->length &&
8671 PyUnicode_CheckExact(self)) {
8672 Py_INCREF(self);
8673 return (PyObject *)self;
8674 } else if (step == 1) {
8675 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008676 } else {
8677 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008678 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8679 sizeof(Py_UNICODE));
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008680
8681 if (result_buf == NULL)
8682 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008683
8684 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8685 result_buf[i] = source_buf[cur];
8686 }
Tim Petersced69f82003-09-16 20:30:58 +00008687
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008688 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008689 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008690 return result;
8691 }
8692 } else {
8693 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8694 return NULL;
8695 }
8696}
8697
8698static PyMappingMethods unicode_as_mapping = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008699 (lenfunc)unicode_length, /* mp_length */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008700 (binaryfunc)unicode_subscript, /* mp_subscript */
8701 (objobjargproc)0, /* mp_ass_subscript */
8702};
8703
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705/* Helpers for PyUnicode_Format() */
8706
8707static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008708getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008710 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 if (argidx < arglen) {
8712 (*p_argidx)++;
8713 if (arglen < 0)
8714 return args;
8715 else
8716 return PyTuple_GetItem(args, argidx);
8717 }
8718 PyErr_SetString(PyExc_TypeError,
8719 "not enough arguments for format string");
8720 return NULL;
8721}
8722
Martin v. Löwis18e16552006-02-15 17:27:45 +00008723static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008724strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008726 register Py_ssize_t i;
8727 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 for (i = len - 1; i >= 0; i--)
8729 buffer[i] = (Py_UNICODE) charbuffer[i];
8730
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 return len;
8732}
8733
Neal Norwitzfc76d632006-01-10 06:03:13 +00008734static int
8735doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8736{
Tim Peters15231542006-02-16 01:08:01 +00008737 Py_ssize_t result;
8738
Neal Norwitzfc76d632006-01-10 06:03:13 +00008739 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008740 result = strtounicode(buffer, (char *)buffer);
8741 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008742}
8743
#if 0
/* Compiled out.  Format the long x according to format into buffer and
   return the number of characters produced.  The text is written as
   chars into the start of buffer and then widened in place by
   strtounicode(). */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *narrow = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(narrow, len, format, x);
    nchars = strtounicode(buffer, narrow);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
Neal Norwitzfc76d632006-01-10 06:03:13 +00008755
Guido van Rossum078151d2002-08-11 04:24:12 +00008756/* XXX To save some code duplication, formatfloat/long/int could have been
8757 shared with stringobject.c, converting from 8-bit to Unicode after the
8758 formatting is done. */
8759
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760static int
8761formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008762 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 int flags,
8764 int prec,
8765 int type,
8766 PyObject *v)
8767{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008768 /* fmt = '%#.' + `prec` + `type`
8769 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 char fmt[20];
8771 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008772
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 x = PyFloat_AsDouble(v);
8774 if (x == -1.0 && PyErr_Occurred())
8775 return -1;
8776 if (prec < 0)
8777 prec = 6;
Eric Smith22b85b32008-07-17 19:18:29 +00008778 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8779 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008780 /* Worst case length calc to ensure no buffer overrun:
8781
8782 'g' formats:
8783 fmt = %#.<prec>g
8784 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8785 for any double rep.)
8786 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8787
8788 'f' formats:
8789 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8790 len = 1 + 50 + 1 + prec = 52 + prec
8791
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008792 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008793 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008794
8795 */
Guido van Rossumb5a755e2007-07-18 18:15:48 +00008796 if (((type == 'g' || type == 'G') &&
8797 buflen <= (size_t)10 + (size_t)prec) ||
Eric Smith22b85b32008-07-17 19:18:29 +00008798 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008799 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008800 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008801 return -1;
8802 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008803 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8804 (flags&F_ALT) ? "#" : "",
8805 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008806 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807}
8808
Tim Peters38fd5b62000-09-21 05:43:11 +00008809static PyObject*
8810formatlong(PyObject *val, int flags, int prec, int type)
8811{
8812 char *buf;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008813 int len;
Tim Peters38fd5b62000-09-21 05:43:11 +00008814 PyObject *str; /* temporary string object. */
Walter Dörwald63a28be2007-06-20 15:11:12 +00008815 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008816
Christian Heimes72b710a2008-05-26 13:28:38 +00008817 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008818 if (!str)
8819 return NULL;
Walter Dörwald63a28be2007-06-20 15:11:12 +00008820 result = PyUnicode_FromStringAndSize(buf, len);
Tim Peters38fd5b62000-09-21 05:43:11 +00008821 Py_DECREF(str);
Walter Dörwald63a28be2007-06-20 15:11:12 +00008822 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008823}
8824
#if 0
/* Compiled out.  Format the C long value of v into buf for an integer
   conversion.  Returns the number of characters written, or -1 with an
   exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64];       /* plenty big enough! */
    char *sign;
    long x;
    int based;          /* nonzero for the 'x', 'X' and 'o' conversions */

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value under %u is printed as signed decimal. */
    if (x < 0 && type == 'u')
        type = 'd';

    based = (type == 'x' || type == 'X' || type == 'o');

    /* For hex and octal the minus sign is emitted by hand and the
       magnitude is formatted below. */
    sign = (x < 0 && based) ? "-" : "";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && based) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    /* With a hand-written sign, format the negated value. */
    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902
8903static int
8904formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008905 size_t buflen,
8906 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008908 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008909 if (PyUnicode_Check(v)) {
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008910 if (PyUnicode_GET_SIZE(v) == 1) {
8911 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8912 buf[1] = '\0';
8913 return 1;
8914 }
8915#ifndef Py_UNICODE_WIDE
8916 if (PyUnicode_GET_SIZE(v) == 2) {
8917 /* Decode a valid surrogate pair */
8918 int c0 = PyUnicode_AS_UNICODE(v)[0];
8919 int c1 = PyUnicode_AS_UNICODE(v)[1];
8920 if (0xD800 <= c0 && c0 <= 0xDBFF &&
8921 0xDC00 <= c1 && c1 <= 0xDFFF) {
8922 buf[0] = c0;
8923 buf[1] = c1;
8924 buf[2] = '\0';
8925 return 2;
8926 }
8927 }
8928#endif
8929 goto onError;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 else {
8932 /* Integer input truncated to a character */
8933 long x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008934 x = PyLong_AsLong(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008936 goto onError;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008937
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008938 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00008939 PyErr_SetString(PyExc_OverflowError,
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008940 "%c arg not in range(0x110000)");
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008941 return -1;
8942 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008943
8944#ifndef Py_UNICODE_WIDE
8945 if (x > 0xffff) {
8946 x -= 0x10000;
8947 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8948 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8949 return 2;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008950 }
8951#endif
8952 buf[0] = (Py_UNICODE) x;
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00008953 buf[1] = '\0';
8954 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008956
8957 onError:
8958 PyErr_SetString(PyExc_TypeError,
8959 "%c requires int or char");
8960 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961}
8962
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that it behaves as a single
   operand wherever it is used (for example after sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8972
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973PyObject *PyUnicode_Format(PyObject *format,
8974 PyObject *args)
8975{
8976 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008977 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 int args_owned = 0;
8979 PyUnicodeObject *result = NULL;
8980 PyObject *dict = NULL;
8981 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008982
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 if (format == NULL || args == NULL) {
8984 PyErr_BadInternalCall();
8985 return NULL;
8986 }
8987 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008988 if (uformat == NULL)
8989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 fmt = PyUnicode_AS_UNICODE(uformat);
8991 fmtcnt = PyUnicode_GET_SIZE(uformat);
8992
8993 reslen = rescnt = fmtcnt + 100;
8994 result = _PyUnicode_New(reslen);
8995 if (result == NULL)
8996 goto onError;
8997 res = PyUnicode_AS_UNICODE(result);
8998
8999 if (PyTuple_Check(args)) {
9000 arglen = PyTuple_Size(args);
9001 argidx = 0;
9002 }
9003 else {
9004 arglen = -1;
9005 argidx = -2;
9006 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009007 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009008 !PyUnicode_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 dict = args;
9010
9011 while (--fmtcnt >= 0) {
9012 if (*fmt != '%') {
9013 if (--rescnt < 0) {
9014 rescnt = fmtcnt + 100;
9015 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009016 if (_PyUnicode_Resize(&result, reslen) < 0)
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9019 --rescnt;
9020 }
9021 *res++ = *fmt++;
9022 }
9023 else {
9024 /* Got a format specifier */
9025 int flags = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009026 Py_ssize_t width = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 Py_UNICODE c = '\0';
9029 Py_UNICODE fill;
Christian Heimesa612dc02008-02-24 13:08:18 +00009030 int isnumok;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 PyObject *v = NULL;
9032 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009033 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 Py_UNICODE sign;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009035 Py_ssize_t len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009036 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037
9038 fmt++;
9039 if (*fmt == '(') {
9040 Py_UNICODE *keystart;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009041 Py_ssize_t keylen;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 PyObject *key;
9043 int pcount = 1;
9044
9045 if (dict == NULL) {
9046 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00009047 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 goto onError;
9049 }
9050 ++fmt;
9051 --fmtcnt;
9052 keystart = fmt;
9053 /* Skip over balanced parentheses */
9054 while (pcount > 0 && --fmtcnt >= 0) {
9055 if (*fmt == ')')
9056 --pcount;
9057 else if (*fmt == '(')
9058 ++pcount;
9059 fmt++;
9060 }
9061 keylen = fmt - keystart - 1;
9062 if (fmtcnt < 0 || pcount > 0) {
9063 PyErr_SetString(PyExc_ValueError,
9064 "incomplete format key");
9065 goto onError;
9066 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00009067#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00009068 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069 then looked up since Python uses strings to hold
9070 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00009071 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072 key = PyUnicode_EncodeUTF8(keystart,
9073 keylen,
9074 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00009075#else
9076 key = PyUnicode_FromUnicode(keystart, keylen);
9077#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078 if (key == NULL)
9079 goto onError;
9080 if (args_owned) {
9081 Py_DECREF(args);
9082 args_owned = 0;
9083 }
9084 args = PyObject_GetItem(dict, key);
9085 Py_DECREF(key);
9086 if (args == NULL) {
9087 goto onError;
9088 }
9089 args_owned = 1;
9090 arglen = -1;
9091 argidx = -2;
9092 }
9093 while (--fmtcnt >= 0) {
9094 switch (c = *fmt++) {
9095 case '-': flags |= F_LJUST; continue;
9096 case '+': flags |= F_SIGN; continue;
9097 case ' ': flags |= F_BLANK; continue;
9098 case '#': flags |= F_ALT; continue;
9099 case '0': flags |= F_ZERO; continue;
9100 }
9101 break;
9102 }
9103 if (c == '*') {
9104 v = getnextarg(args, arglen, &argidx);
9105 if (v == NULL)
9106 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00009107 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 PyErr_SetString(PyExc_TypeError,
9109 "* wants int");
9110 goto onError;
9111 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009112 width = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00009113 if (width == -1 && PyErr_Occurred())
9114 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115 if (width < 0) {
9116 flags |= F_LJUST;
9117 width = -width;
9118 }
9119 if (--fmtcnt >= 0)
9120 c = *fmt++;
9121 }
9122 else if (c >= '0' && c <= '9') {
9123 width = c - '0';
9124 while (--fmtcnt >= 0) {
9125 c = *fmt++;
9126 if (c < '0' || c > '9')
9127 break;
9128 if ((width*10) / 10 != width) {
9129 PyErr_SetString(PyExc_ValueError,
9130 "width too big");
9131 goto onError;
9132 }
9133 width = width*10 + (c - '0');
9134 }
9135 }
9136 if (c == '.') {
9137 prec = 0;
9138 if (--fmtcnt >= 0)
9139 c = *fmt++;
9140 if (c == '*') {
9141 v = getnextarg(args, arglen, &argidx);
9142 if (v == NULL)
9143 goto onError;
Christian Heimes217cfd12007-12-02 14:31:20 +00009144 if (!PyLong_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 PyErr_SetString(PyExc_TypeError,
9146 "* wants int");
9147 goto onError;
9148 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009149 prec = PyLong_AsLong(v);
Guido van Rossumddefaf32007-01-14 03:31:43 +00009150 if (prec == -1 && PyErr_Occurred())
9151 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 if (prec < 0)
9153 prec = 0;
9154 if (--fmtcnt >= 0)
9155 c = *fmt++;
9156 }
9157 else if (c >= '0' && c <= '9') {
9158 prec = c - '0';
9159 while (--fmtcnt >= 0) {
9160 c = Py_CHARMASK(*fmt++);
9161 if (c < '0' || c > '9')
9162 break;
9163 if ((prec*10) / 10 != prec) {
9164 PyErr_SetString(PyExc_ValueError,
9165 "prec too big");
9166 goto onError;
9167 }
9168 prec = prec*10 + (c - '0');
9169 }
9170 }
9171 } /* prec */
9172 if (fmtcnt >= 0) {
9173 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174 if (--fmtcnt >= 0)
9175 c = *fmt++;
9176 }
9177 }
9178 if (fmtcnt < 0) {
9179 PyErr_SetString(PyExc_ValueError,
9180 "incomplete format");
9181 goto onError;
9182 }
9183 if (c != '%') {
9184 v = getnextarg(args, arglen, &argidx);
9185 if (v == NULL)
9186 goto onError;
9187 }
9188 sign = 0;
9189 fill = ' ';
9190 switch (c) {
9191
9192 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009193 pbuf = formatbuf;
9194 /* presume that buffer length is at least 1 */
9195 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 len = 1;
9197 break;
9198
9199 case 's':
9200 case 'r':
Georg Brandl559e5d72008-06-11 18:37:52 +00009201 case 'a':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202 if (PyUnicode_Check(v) && c == 's') {
9203 temp = v;
9204 Py_INCREF(temp);
9205 }
9206 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207 if (c == 's')
Thomas Heller519a0422007-11-15 20:48:54 +00009208 temp = PyObject_Str(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009209 else if (c == 'r')
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210 temp = PyObject_Repr(v);
Georg Brandl559e5d72008-06-11 18:37:52 +00009211 else
9212 temp = PyObject_ASCII(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213 if (temp == NULL)
9214 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009215 if (PyUnicode_Check(temp))
9216 /* nothing to do */;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00009217 else {
9218 Py_DECREF(temp);
9219 PyErr_SetString(PyExc_TypeError,
9220 "%s argument has non-string str()");
9221 goto onError;
9222 }
9223 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009224 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 len = PyUnicode_GET_SIZE(temp);
9226 if (prec >= 0 && len > prec)
9227 len = prec;
9228 break;
9229
9230 case 'i':
9231 case 'd':
9232 case 'u':
9233 case 'o':
9234 case 'x':
9235 case 'X':
9236 if (c == 'i')
9237 c = 'd';
Christian Heimesa612dc02008-02-24 13:08:18 +00009238 isnumok = 0;
9239 if (PyNumber_Check(v)) {
9240 PyObject *iobj=NULL;
9241
9242 if (PyLong_Check(v)) {
9243 iobj = v;
9244 Py_INCREF(iobj);
9245 }
9246 else {
9247 iobj = PyNumber_Long(v);
9248 }
9249 if (iobj!=NULL) {
9250 if (PyLong_Check(iobj)) {
9251 isnumok = 1;
9252 temp = formatlong(iobj, flags, prec, c);
9253 Py_DECREF(iobj);
9254 if (!temp)
9255 goto onError;
9256 pbuf = PyUnicode_AS_UNICODE(temp);
9257 len = PyUnicode_GET_SIZE(temp);
9258 sign = 1;
9259 }
9260 else {
9261 Py_DECREF(iobj);
9262 }
9263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 }
Christian Heimesa612dc02008-02-24 13:08:18 +00009265 if (!isnumok) {
9266 PyErr_Format(PyExc_TypeError,
9267 "%%%c format: a number is required, "
Martin v. Löwis5a6f4582008-04-07 03:22:07 +00009268 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
Tim Peters38fd5b62000-09-21 05:43:11 +00009269 goto onError;
Tim Peters38fd5b62000-09-21 05:43:11 +00009270 }
9271 if (flags & F_ZERO)
9272 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273 break;
9274
9275 case 'e':
9276 case 'E':
9277 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00009278 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279 case 'g':
9280 case 'G':
Eric Smith22b85b32008-07-17 19:18:29 +00009281 if (c == 'F')
9282 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009283 pbuf = formatbuf;
9284 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9285 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286 if (len < 0)
9287 goto onError;
9288 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00009289 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 fill = '0';
9291 break;
9292
9293 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009294 pbuf = formatbuf;
9295 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296 if (len < 0)
9297 goto onError;
9298 break;
9299
9300 default:
9301 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00009302 "unsupported format character '%c' (0x%x) "
Thomas Wouters89f507f2006-12-13 04:49:30 +00009303 "at index %zd",
Tim Petersced69f82003-09-16 20:30:58 +00009304 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00009305 (int)c,
Thomas Wouters89f507f2006-12-13 04:49:30 +00009306 (Py_ssize_t)(fmt - 1 -
9307 PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308 goto onError;
9309 }
9310 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009311 if (*pbuf == '-' || *pbuf == '+') {
9312 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009313 len--;
9314 }
9315 else if (flags & F_SIGN)
9316 sign = '+';
9317 else if (flags & F_BLANK)
9318 sign = ' ';
9319 else
9320 sign = 0;
9321 }
9322 if (width < len)
9323 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009324 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325 reslen -= rescnt;
9326 rescnt = width + fmtcnt + 100;
9327 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009328 if (reslen < 0) {
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00009329 Py_XDECREF(temp);
Thomas Woutersa96affe2006-03-12 00:29:36 +00009330 PyErr_NoMemory();
9331 goto onError;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00009332 }
Thomas Woutersa96affe2006-03-12 00:29:36 +00009333 if (_PyUnicode_Resize(&result, reslen) < 0) {
9334 Py_XDECREF(temp);
9335 goto onError;
9336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337 res = PyUnicode_AS_UNICODE(result)
9338 + reslen - rescnt;
9339 }
9340 if (sign) {
9341 if (fill != ' ')
9342 *res++ = sign;
9343 rescnt--;
9344 if (width > len)
9345 width--;
9346 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009347 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009348 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009349 assert(pbuf[1] == c);
9350 if (fill != ' ') {
9351 *res++ = *pbuf++;
9352 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00009353 }
Tim Petersfff53252001-04-12 18:38:48 +00009354 rescnt -= 2;
9355 width -= 2;
9356 if (width < 0)
9357 width = 0;
9358 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00009359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 if (width > len && !(flags & F_LJUST)) {
9361 do {
9362 --rescnt;
9363 *res++ = fill;
9364 } while (--width > len);
9365 }
Tim Peters38fd5b62000-09-21 05:43:11 +00009366 if (fill == ' ') {
9367 if (sign)
9368 *res++ = sign;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00009369 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00009370 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00009371 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00009372 *res++ = *pbuf++;
9373 *res++ = *pbuf++;
9374 }
9375 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009376 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 res += len;
9378 rescnt -= len;
9379 while (--width >= len) {
9380 --rescnt;
9381 *res++ = ' ';
9382 }
9383 if (dict && (argidx < arglen) && c != '%') {
9384 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009385 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009386 Py_XDECREF(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387 goto onError;
9388 }
9389 Py_XDECREF(temp);
9390 } /* '%' */
9391 } /* until end */
9392 if (argidx < arglen && !dict) {
9393 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00009394 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395 goto onError;
9396 }
9397
Thomas Woutersa96affe2006-03-12 00:29:36 +00009398 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9399 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400 if (args_owned) {
9401 Py_DECREF(args);
9402 }
9403 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404 return (PyObject *)result;
9405
9406 onError:
9407 Py_XDECREF(result);
9408 Py_DECREF(uformat);
9409 if (args_owned) {
9410 Py_DECREF(args);
9411 }
9412 return NULL;
9413}
9414
Jeremy Hylton938ace62002-07-17 16:30:39 +00009415static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009416unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9417
Tim Peters6d6c1a32001-08-02 04:15:00 +00009418static PyObject *
9419unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9420{
9421 PyObject *x = NULL;
Guido van Rossum55b4a7b2007-07-11 09:28:11 +00009422 static char *kwlist[] = {"object", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00009423 char *encoding = NULL;
9424 char *errors = NULL;
9425
Guido van Rossume023fe02001-08-30 03:12:59 +00009426 if (type != &PyUnicode_Type)
9427 return unicode_subtype_new(type, args, kwds);
Alexandre Vassalotti999679a2008-05-03 04:42:16 +00009428 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Tim Peters6d6c1a32001-08-02 04:15:00 +00009429 kwlist, &x, &encoding, &errors))
9430 return NULL;
9431 if (x == NULL)
9432 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009433 if (encoding == NULL && errors == NULL)
Thomas Heller519a0422007-11-15 20:48:54 +00009434 return PyObject_Str(x);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00009435 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00009436 return PyUnicode_FromEncodedObject(x, encoding, errors);
9437}
9438
Guido van Rossume023fe02001-08-30 03:12:59 +00009439static PyObject *
9440unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9441{
Tim Petersaf90b3e2001-09-12 05:18:58 +00009442 PyUnicodeObject *tmp, *pnew;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009443 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009444
9445 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9446 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9447 if (tmp == NULL)
9448 return NULL;
9449 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009450 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009451 if (pnew == NULL) {
9452 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00009453 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00009454 }
Christian Heimesb186d002008-03-18 15:15:01 +00009455 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
Tim Petersaf90b3e2001-09-12 05:18:58 +00009456 if (pnew->str == NULL) {
9457 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00009458 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00009459 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00009460 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00009461 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00009462 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9463 pnew->length = n;
9464 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00009465 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00009466 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009467}
9468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009469PyDoc_STRVAR(unicode_doc,
Georg Brandl17cb8a82008-05-30 08:20:09 +00009470"str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009471\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009472Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009473encoding defaults to the current default string encoding.\n\
9474errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009475
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009476static PyObject *unicode_iter(PyObject *seq);
9477
/* Type object for str.  Unused slots are 0; each entry is labelled with
   the slot it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                                  /* tp_name */
    sizeof(PyUnicodeObject),                /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,            /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_compare */
    unicode_repr,                           /* tp_repr */
    &unicode_as_number,                     /* tp_as_number */
    &unicode_as_sequence,                   /* tp_as_sequence */
    &unicode_as_mapping,                    /* tp_as_mapping */
    (hashfunc) unicode_hash,                /* tp_hash*/
    0,                                      /* tp_call*/
    (reprfunc) unicode_str,                 /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,            /* tp_flags */
    unicode_doc,                            /* tp_doc */
    0,                                      /* tp_traverse */
    0,                                      /* tp_clear */
    PyUnicode_RichCompare,                  /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    unicode_iter,                           /* tp_iter */
    0,                                      /* tp_iternext */
    unicode_methods,                        /* tp_methods */
    0,                                      /* tp_members */
    0,                                      /* tp_getset */
    &PyBaseObject_Type,                     /* tp_base */
    0,                                      /* tp_dict */
    0,                                      /* tp_descr_get */
    0,                                      /* tp_descr_set */
    0,                                      /* tp_dictoffset */
    0,                                      /* tp_init */
    0,                                      /* tp_alloc */
    unicode_new,                            /* tp_new */
    PyObject_Del,                           /* tp_free */
};
9521
9522/* Initialize the Unicode implementation */
9523
Thomas Wouters78890102000-07-22 19:25:51 +00009524void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009526 int i;
9527
Thomas Wouters477c8d52006-05-27 19:21:47 +00009528 /* XXX - move this array to unicodectype.c ? */
9529 Py_UNICODE linebreak[] = {
9530 0x000A, /* LINE FEED */
9531 0x000D, /* CARRIAGE RETURN */
9532 0x001C, /* FILE SEPARATOR */
9533 0x001D, /* GROUP SEPARATOR */
9534 0x001E, /* RECORD SEPARATOR */
9535 0x0085, /* NEXT LINE */
9536 0x2028, /* LINE SEPARATOR */
9537 0x2029, /* PARAGRAPH SEPARATOR */
9538 };
9539
Fred Drakee4315f52000-05-09 19:53:39 +00009540 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009541 free_list = NULL;
9542 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009544 if (!unicode_empty)
9545 return;
9546
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009547 for (i = 0; i < 256; i++)
9548 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009549 if (PyType_Ready(&PyUnicode_Type) < 0)
9550 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009551
9552 /* initialize the linebreak bloom filter */
9553 bloom_linebreak = make_bloom_mask(
9554 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9555 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009556
9557 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558}
9559
9560/* Finalize the Unicode implementation */
9561
Christian Heimesa156e092008-02-16 07:38:31 +00009562int
9563PyUnicode_ClearFreeList(void)
9564{
9565 int freelist_size = numfree;
9566 PyUnicodeObject *u;
9567
9568 for (u = free_list; u != NULL;) {
9569 PyUnicodeObject *v = u;
9570 u = *(PyUnicodeObject **)u;
9571 if (v->str)
Christian Heimesb186d002008-03-18 15:15:01 +00009572 PyObject_DEL(v->str);
Christian Heimesa156e092008-02-16 07:38:31 +00009573 Py_XDECREF(v->defenc);
9574 PyObject_Del(v);
9575 numfree--;
9576 }
9577 free_list = NULL;
9578 assert(numfree == 0);
9579 return freelist_size;
9580}
9581
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582void
Thomas Wouters78890102000-07-22 19:25:51 +00009583_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009585 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009587 Py_XDECREF(unicode_empty);
9588 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009589
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009590 for (i = 0; i < 256; i++) {
9591 if (unicode_latin1[i]) {
9592 Py_DECREF(unicode_latin1[i]);
9593 unicode_latin1[i] = NULL;
9594 }
9595 }
Christian Heimesa156e092008-02-16 07:38:31 +00009596 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009598
Walter Dörwald16807132007-05-25 13:52:07 +00009599void
9600PyUnicode_InternInPlace(PyObject **p)
9601{
9602 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9603 PyObject *t;
9604 if (s == NULL || !PyUnicode_Check(s))
9605 Py_FatalError(
9606 "PyUnicode_InternInPlace: unicode strings only please!");
9607 /* If it's a subclass, we don't really know what putting
9608 it in the interned dict might do. */
9609 if (!PyUnicode_CheckExact(s))
9610 return;
9611 if (PyUnicode_CHECK_INTERNED(s))
9612 return;
9613 if (interned == NULL) {
9614 interned = PyDict_New();
9615 if (interned == NULL) {
9616 PyErr_Clear(); /* Don't leave an exception */
9617 return;
9618 }
9619 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009620 /* It might be that the GetItem call fails even
9621 though the key is present in the dictionary,
9622 namely when this happens during a stack overflow. */
9623 Py_ALLOW_RECURSION
Walter Dörwald16807132007-05-25 13:52:07 +00009624 t = PyDict_GetItem(interned, (PyObject *)s);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009625 Py_END_ALLOW_RECURSION
9626
Walter Dörwald16807132007-05-25 13:52:07 +00009627 if (t) {
9628 Py_INCREF(t);
9629 Py_DECREF(*p);
9630 *p = t;
9631 return;
9632 }
9633
Martin v. Löwis5b222132007-06-10 09:51:05 +00009634 PyThreadState_GET()->recursion_critical = 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009635 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9636 PyErr_Clear();
Martin v. Löwis5b222132007-06-10 09:51:05 +00009637 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009638 return;
9639 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00009640 PyThreadState_GET()->recursion_critical = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009641 /* The two references in interned are not counted by refcnt.
9642 The deallocator will take care of this */
Christian Heimes90aa7642007-12-19 02:45:37 +00009643 Py_REFCNT(s) -= 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009644 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9645}
9646
9647void
9648PyUnicode_InternImmortal(PyObject **p)
9649{
9650 PyUnicode_InternInPlace(p);
9651 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9652 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9653 Py_INCREF(*p);
9654 }
9655}
9656
9657PyObject *
9658PyUnicode_InternFromString(const char *cp)
9659{
9660 PyObject *s = PyUnicode_FromString(cp);
9661 if (s == NULL)
9662 return NULL;
9663 PyUnicode_InternInPlace(&s);
9664 return s;
9665}
9666
9667void _Py_ReleaseInternedUnicodeStrings(void)
9668{
9669 PyObject *keys;
9670 PyUnicodeObject *s;
9671 Py_ssize_t i, n;
9672 Py_ssize_t immortal_size = 0, mortal_size = 0;
9673
9674 if (interned == NULL || !PyDict_Check(interned))
9675 return;
9676 keys = PyDict_Keys(interned);
9677 if (keys == NULL || !PyList_Check(keys)) {
9678 PyErr_Clear();
9679 return;
9680 }
9681
9682 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9683 detector, interned unicode strings are not forcibly deallocated;
9684 rather, we give them their stolen references back, and then clear
9685 and DECREF the interned dict. */
9686
9687 n = PyList_GET_SIZE(keys);
9688 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9689 n);
9690 for (i = 0; i < n; i++) {
9691 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9692 switch (s->state) {
9693 case SSTATE_NOT_INTERNED:
9694 /* XXX Shouldn't happen */
9695 break;
9696 case SSTATE_INTERNED_IMMORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009697 Py_REFCNT(s) += 1;
Walter Dörwald16807132007-05-25 13:52:07 +00009698 immortal_size += s->length;
9699 break;
9700 case SSTATE_INTERNED_MORTAL:
Christian Heimes90aa7642007-12-19 02:45:37 +00009701 Py_REFCNT(s) += 2;
Walter Dörwald16807132007-05-25 13:52:07 +00009702 mortal_size += s->length;
9703 break;
9704 default:
9705 Py_FatalError("Inconsistent interned string state.");
9706 }
9707 s->state = SSTATE_NOT_INTERNED;
9708 }
9709 fprintf(stderr, "total size of all interned strings: "
9710 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9711 "mortal/immortal\n", mortal_size, immortal_size);
9712 Py_DECREF(keys);
9713 PyDict_Clear(interned);
9714 Py_DECREF(interned);
9715 interned = NULL;
9716}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009717
9718
9719/********************* Unicode Iterator **************************/
9720
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9726
9727static void
9728unicodeiter_dealloc(unicodeiterobject *it)
9729{
9730 _PyObject_GC_UNTRACK(it);
9731 Py_XDECREF(it->it_seq);
9732 PyObject_GC_Del(it);
9733}
9734
9735static int
9736unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9737{
9738 Py_VISIT(it->it_seq);
9739 return 0;
9740}
9741
9742static PyObject *
9743unicodeiter_next(unicodeiterobject *it)
9744{
9745 PyUnicodeObject *seq;
9746 PyObject *item;
9747
9748 assert(it != NULL);
9749 seq = it->it_seq;
9750 if (seq == NULL)
9751 return NULL;
9752 assert(PyUnicode_Check(seq));
9753
9754 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
Guido van Rossum49d6b072006-08-17 21:11:47 +00009755 item = PyUnicode_FromUnicode(
9756 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009757 if (item != NULL)
9758 ++it->it_index;
9759 return item;
9760 }
9761
9762 Py_DECREF(seq);
9763 it->it_seq = NULL;
9764 return NULL;
9765}
9766
9767static PyObject *
9768unicodeiter_len(unicodeiterobject *it)
9769{
9770 Py_ssize_t len = 0;
9771 if (it->it_seq)
9772 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
Christian Heimes217cfd12007-12-02 14:31:20 +00009773 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009774}
9775
9776PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9777
/* Method table of the str iterator: only __length_hint__ is provided. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9783
/* Type object for the str iterator.  It is GC-enabled
   (Py_TPFLAGS_HAVE_GC) and returns itself from tp_iter. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                         /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_compare */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
9816
9817static PyObject *
9818unicode_iter(PyObject *seq)
9819{
9820 unicodeiterobject *it;
9821
9822 if (!PyUnicode_Check(seq)) {
9823 PyErr_BadInternalCall();
9824 return NULL;
9825 }
9826 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9827 if (it == NULL)
9828 return NULL;
9829 it->it_index = 0;
9830 Py_INCREF(seq);
9831 it->it_seq = (PyUnicodeObject *)seq;
9832 _PyObject_GC_TRACK(it);
9833 return (PyObject *)it;
9834}
9835
Martin v. Löwis5b222132007-06-10 09:51:05 +00009836size_t
9837Py_UNICODE_strlen(const Py_UNICODE *u)
9838{
9839 int res = 0;
9840 while(*u++)
9841 res++;
9842 return res;
9843}
9844
9845Py_UNICODE*
9846Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9847{
9848 Py_UNICODE *u = s1;
9849 while ((*u++ = *s2++));
9850 return s1;
9851}
9852
9853Py_UNICODE*
9854Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9855{
9856 Py_UNICODE *u = s1;
9857 while ((*u++ = *s2++))
9858 if (n-- == 0)
9859 break;
9860 return s1;
9861}
9862
9863int
9864Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9865{
9866 while (*s1 && *s2 && *s1 == *s2)
9867 s1++, s2++;
9868 if (*s1 && *s2)
9869 return (*s1 < *s2) ? -1 : +1;
9870 if (*s1)
9871 return 1;
9872 if (*s2)
9873 return -1;
9874 return 0;
9875}
9876
9877Py_UNICODE*
9878Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9879{
9880 const Py_UNICODE *p;
9881 for (p = s; *p; p++)
9882 if (*p == c)
9883 return (Py_UNICODE*)p;
9884 return NULL;
9885}
9886
9887
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009888#ifdef __cplusplus
9889}
9890#endif
9891
9892
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009893/*
9894Local variables:
9895c-basic-offset: 4
9896indent-tabs-mode: nil
9897End:
9898*/