Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: c8c07a613b1ce28bc0f9dcaccfd7ece3f0f43cf3 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Limit for the Unicode object free list: maximum number of released
   PyUnicodeObject structures that are kept around for reuse. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   When an object is put on the free list and its length is less than
   this limit, its Py_UNICODE buffer is kept allocated together with
   the object; buffers of objects with this length or more are released.
   This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
				227	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				228	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	230	/* Keep-Alive optimization */
				231	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	232	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	unicode->str = NULL;
				234	unicode->length = 0;
				235	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	236	if (unicode->defenc) {
				237	Py_DECREF(unicode->defenc);
				238	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	239	}
				240	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	(PyUnicodeObject *)unicode = unicode_freelist;
				242	unicode_freelist = unicode;
				243	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	}
				245	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	246	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	247	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	248	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	249	}
				250	}
				251
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	252	int PyUnicode_Resize(PyObject **unicode,
				253	int length)
				254	{
				255	register PyUnicodeObject *v;
				256
				257	/* Argument checks */
				258	if (unicode == NULL) {
				259	PyErr_BadInternalCall();
				260	return -1;
				261	}
				262	v = (PyUnicodeObject )unicode;
				263	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				264	PyErr_BadInternalCall();
				265	return -1;
				266	}
				267
				268	/* Resizing unicode_empty and single character objects is not
				269	possible since these are being shared. We simply return a fresh
				270	copy with the same Unicode content. */
				271	if (v->length != length &&
				272	(v == unicode_empty \|\| v->length == 1)) {
				273	PyUnicodeObject *w = _PyUnicode_New(length);
				274	if (w == NULL)
				275	return -1;
				276	Py_UNICODE_COPY(w->str, v->str,
				277	length < v->length ? length : v->length);
				278	unicode = (PyObject )w;
				279	return 0;
				280	}
				281
				282	/* Note that we don't have to modify *unicode for unshared Unicode
				283	objects, since we can modify them in-place. */
				284	return unicode_resize(v, length);
				285	}
				286
				287	/* Internal API for use in unicodeobject.c only ! */
				288	#define _PyUnicode_Resize(unicodevar, length) \
				289	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				290
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	291	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				292	int size)
				293	{
				294	PyUnicodeObject *unicode;
				295
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	296	/* If the Unicode data is known at construction time, we can apply
				297	some optimizations which share commonly used objects. */
				298	if (u != NULL) {
				299
				300	/* Optimization for empty strings */
				301	if (size == 0 && unicode_empty != NULL) {
				302	Py_INCREF(unicode_empty);
				303	return (PyObject *)unicode_empty;
				304	}
				305
				306	/* Single character Unicode objects in the Latin-1 range are
				307	shared when using this constructor */
				308	if (size == 1 && *u < 256) {
				309	unicode = unicode_latin1[*u];
				310	if (!unicode) {
				311	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	312	if (!unicode)
				313	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	314	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	315	unicode_latin1[*u] = unicode;
				316	}
				317	Py_INCREF(unicode);
				318	return (PyObject *)unicode;
				319	}
				320	}
				321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	322	unicode = _PyUnicode_New(size);
				323	if (!unicode)
				324	return NULL;
				325
				326	/* Copy the Unicode data into the new object */
				327	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	328	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	329
				330	return (PyObject *)unicode;
				331	}
				332
				333	#ifdef HAVE_WCHAR_H
				334
				335	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				336	int size)
				337	{
				338	PyUnicodeObject *unicode;
				339
				340	if (w == NULL) {
				341	PyErr_BadInternalCall();
				342	return NULL;
				343	}
				344
				345	unicode = _PyUnicode_New(size);
				346	if (!unicode)
				347	return NULL;
				348
				349	/* Copy the wchar_t data into the new object */
				350	#ifdef HAVE_USABLE_WCHAR_T
				351	memcpy(unicode->str, w, size * sizeof(wchar_t));
				352	#else
				353	{
				354	register Py_UNICODE *u;
				355	register int i;
				356	u = PyUnicode_AS_UNICODE(unicode);
				357	for (i = size; i >= 0; i--)
				358	u++ = w++;
				359	}
				360	#endif
				361
				362	return (PyObject *)unicode;
				363	}
				364
				365	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				366	register wchar_t *w,
				367	int size)
				368	{
				369	if (unicode == NULL) {
				370	PyErr_BadInternalCall();
				371	return -1;
				372	}
				373	if (size > PyUnicode_GET_SIZE(unicode))
				374	size = PyUnicode_GET_SIZE(unicode);
				375	#ifdef HAVE_USABLE_WCHAR_T
				376	memcpy(w, unicode->str, size * sizeof(wchar_t));
				377	#else
				378	{
				379	register Py_UNICODE *u;
				380	register int i;
				381	u = PyUnicode_AS_UNICODE(unicode);
				382	for (i = size; i >= 0; i--)
				383	w++ = u++;
				384	}
				385	#endif
				386
				387	return size;
				388	}
				389
				390	#endif
				391
				392	PyObject PyUnicode_FromObject(register PyObject obj)
				393	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	394	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				395	}
				396
				397	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				398	const char *encoding,
				399	const char *errors)
				400	{
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	401	const char *s = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	402	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	403	int owned = 0;
				404	PyObject *v;
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	405	int reclevel;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	406
				407	if (obj == NULL) {
				408	PyErr_BadInternalCall();
				409	return NULL;
				410	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	411
				412	/* Coerce object */
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	413	for (reclevel = 0; reclevel < 2; reclevel++) {
				414
				415	if (PyUnicode_Check(obj)) {
				416	if (encoding) {
				417	PyErr_SetString(PyExc_TypeError,
				418	"decoding Unicode is not supported");
				419	goto onError;
				420	}
				421	if (PyUnicode_CheckExact(obj)) {
				422	Py_INCREF(obj);
				423	v = obj;
				424	}
				425	else {
				426	/* For a subclass of unicode, return a true unicode object
				427	with the same string value. */
				428	v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
				429	PyUnicode_GET_SIZE(obj));
				430	}
				431	goto done;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	432	}
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	433	else if (PyString_Check(obj)) {
				434	s = PyString_AS_STRING(obj);
				435	len = PyString_GET_SIZE(obj);
				436	break;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	437	}
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	438	else {
				439	PyObject *w;
				440
				441	/* Try char buffer interface */
				442	if (PyObject_AsCharBuffer(obj, &s, &len))
				443	PyErr_Clear();
				444	else
				445	break;
				446
				447	/* Mimic the behaviour of str(object) if everything else
				448	fails (see PyObject_Str()); this also covers instances
				449	which implement __str__. */
				450	if (obj->ob_type->tp_str == NULL)
				451	w = PyObject_Repr(obj);
				452	else
				453	w = (*obj->ob_type->tp_str)(obj);
				454	if (w == NULL)
				455	goto onError;
				456	if (owned) {
				457	Py_DECREF(obj);
				458	}
				459	obj = w;
				460	owned = 1;
Tim Peters	78e0fc7	2001-09-11 03:07:38 +0000	[diff] [blame]	461	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	462	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	463
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	464	if (s == NULL) {
				465	PyErr_Format(PyExc_TypeError,
				466	"coercing to Unicode: __str__ recursion limit exceeded "
				467	"(last type: %.80s)",
				468	obj->ob_type->tp_name);
				469	goto onError;
				470	}
				471
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	472	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	473	if (len == 0) {
				474	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	475	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	476	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	477	else
				478	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	479
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	480	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	481	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	482	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	483	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	484	return v;
				485
				486	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	487	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	488	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	489	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	490	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	491	}
				492
				493	PyObject PyUnicode_Decode(const char s,
				494	int size,
				495	const char *encoding,
				496	const char *errors)
				497	{
				498	PyObject buffer = NULL, unicode;
				499
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	500	if (encoding == NULL)
				501	encoding = PyUnicode_GetDefaultEncoding();
				502
				503	/* Shortcuts for common default encodings */
				504	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	505	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	506	else if (strcmp(encoding, "latin-1") == 0)
				507	return PyUnicode_DecodeLatin1(s, size, errors);
				508	else if (strcmp(encoding, "ascii") == 0)
				509	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	510
				511	/* Decode via the codec registry */
				512	buffer = PyBuffer_FromMemory((void *)s, size);
				513	if (buffer == NULL)
				514	goto onError;
				515	unicode = PyCodec_Decode(buffer, encoding, errors);
				516	if (unicode == NULL)
				517	goto onError;
				518	if (!PyUnicode_Check(unicode)) {
				519	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	520	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	521	unicode->ob_type->tp_name);
				522	Py_DECREF(unicode);
				523	goto onError;
				524	}
				525	Py_DECREF(buffer);
				526	return unicode;
				527
				528	onError:
				529	Py_XDECREF(buffer);
				530	return NULL;
				531	}
				532
				533	PyObject PyUnicode_Encode(const Py_UNICODE s,
				534	int size,
				535	const char *encoding,
				536	const char *errors)
				537	{
				538	PyObject v, unicode;
				539
				540	unicode = PyUnicode_FromUnicode(s, size);
				541	if (unicode == NULL)
				542	return NULL;
				543	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				544	Py_DECREF(unicode);
				545	return v;
				546	}
				547
				548	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				549	const char *encoding,
				550	const char *errors)
				551	{
				552	PyObject *v;
				553
				554	if (!PyUnicode_Check(unicode)) {
				555	PyErr_BadArgument();
				556	goto onError;
				557	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	558
				559	if (encoding == NULL)
				560	encoding = PyUnicode_GetDefaultEncoding();
				561
				562	/* Shortcuts for common default encodings */
				563	if (errors == NULL) {
				564	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	565	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	566	else if (strcmp(encoding, "latin-1") == 0)
				567	return PyUnicode_AsLatin1String(unicode);
				568	else if (strcmp(encoding, "ascii") == 0)
				569	return PyUnicode_AsASCIIString(unicode);
				570	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	571
				572	/* Encode via the codec registry */
				573	v = PyCodec_Encode(unicode, encoding, errors);
				574	if (v == NULL)
				575	goto onError;
				576	/* XXX Should we really enforce this ? */
				577	if (!PyString_Check(v)) {
				578	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	579	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	580	v->ob_type->tp_name);
				581	Py_DECREF(v);
				582	goto onError;
				583	}
				584	return v;
				585
				586	onError:
				587	return NULL;
				588	}
				589
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	590	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				591	const char *errors)
				592	{
				593	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				594
				595	if (v)
				596	return v;
				597	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				598	if (v && errors == NULL)
				599	((PyUnicodeObject *)unicode)->defenc = v;
				600	return v;
				601	}
				602
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	603	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				604	{
				605	if (!PyUnicode_Check(unicode)) {
				606	PyErr_BadArgument();
				607	goto onError;
				608	}
				609	return PyUnicode_AS_UNICODE(unicode);
				610
				611	onError:
				612	return NULL;
				613	}
				614
				615	int PyUnicode_GetSize(PyObject *unicode)
				616	{
				617	if (!PyUnicode_Check(unicode)) {
				618	PyErr_BadArgument();
				619	goto onError;
				620	}
				621	return PyUnicode_GET_SIZE(unicode);
				622
				623	onError:
				624	return -1;
				625	}
				626
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	627	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	628	{
				629	return unicode_default_encoding;
				630	}
				631
				632	int PyUnicode_SetDefaultEncoding(const char *encoding)
				633	{
				634	PyObject *v;
				635
				636	/* Make sure the encoding is valid. As side effect, this also
				637	loads the encoding into the codec registry cache. */
				638	v = _PyCodec_Lookup(encoding);
				639	if (v == NULL)
				640	goto onError;
				641	Py_DECREF(v);
				642	strncpy(unicode_default_encoding,
				643	encoding,
				644	sizeof(unicode_default_encoding));
				645	return 0;
				646
				647	onError:
				648	return -1;
				649	}
				650
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

/* Classification of the 128 ASCII characters for UTF-7, indexed by
   character code. Indicates whether a character is special, i.e.
   cannot be directly encoded:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional)

   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
				673
				674	#define SPECIAL(c, encodeO, encodeWS) \
				675	(((c)>127 \|\| utf7_special[(c)] == 1) \|\| \
				676	(encodeWS && (utf7_special[(c)] == 2)) \|\| \
				677	(encodeO && (utf7_special[(c)] == 3)))
				678
				679	#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
				680	#define B64CHAR(c) (isalnum(c) \|\| (c) == '+' \|\| (c) == '/')
				681	#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
				682	(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
				683
				684	#define ENCODE(out, ch, bits) \
				685	while (bits >= 6) { \
				686	*out++ = B64(ch >> (bits-6)); \
				687	bits -= 6; \
				688	}
				689
				690	#define DECODE(out, ch, bits, surrogate) \
				691	while (bits >= 16) { \
				692	Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
				693	bits -= 16; \
				694	if (surrogate) { \
				695	/* We have already generated an error for the high surrogate
				696	so let's not bother seeing if the low surrogate is correct or not */\
				697	surrogate = 0; \
				698	} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
				699	/* This is a surrogate pair. Unfortunately we can't represent \
				700	it in a 16-bit character */ \
				701	surrogate = 1; \
				702	errmsg = "code pairs are not supported"; \
				703	goto utf7Error; \
				704	} else { \
				705	*out++ = outCh; \
				706	} \
				707	} \
				708
				709	static
				710	int utf7_decoding_error(Py_UNICODE **dest,
				711	const char *errors,
				712	const char *details)
				713	{
				714	if ((errors == NULL) \|\|
				715	(strcmp(errors,"strict") == 0)) {
				716	PyErr_Format(PyExc_UnicodeError,
				717	"UTF-7 decoding error: %.400s",
				718	details);
				719	return -1;
				720	}
				721	else if (strcmp(errors,"ignore") == 0) {
				722	return 0;
				723	}
				724	else if (strcmp(errors,"replace") == 0) {
				725	if (dest != NULL) {
				726	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				727	(*dest)++;
				728	}
				729	return 0;
				730	}
				731	else {
				732	PyErr_Format(PyExc_ValueError,
				733	"UTF-7 decoding error; unknown error handling code: %.400s",
				734	errors);
				735	return -1;
				736	}
				737	}
				738
				739	PyObject PyUnicode_DecodeUTF7(const char s,
				740	int size,
				741	const char *errors)
				742	{
				743	const char *e;
				744	PyUnicodeObject *unicode;
				745	Py_UNICODE *p;
				746	const char *errmsg = "";
				747	int inShift = 0;
				748	unsigned int bitsleft = 0;
				749	unsigned long charsleft = 0;
				750	int surrogate = 0;
				751
				752	unicode = _PyUnicode_New(size);
				753	if (!unicode)
				754	return NULL;
				755	if (size == 0)
				756	return (PyObject *)unicode;
				757
				758	p = unicode->str;
				759	e = s + size;
				760
				761	while (s < e) {
				762	Py_UNICODE ch = *s;
				763
				764	if (inShift) {
				765	if ((ch == '-') \|\| !B64CHAR(ch)) {
				766	inShift = 0;
				767	s++;
				768
				769	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				770	if (bitsleft >= 6) {
				771	/* The shift sequence has a partial character in it. If
				772	bitsleft < 6 then we could just classify it as padding
				773	but that is not the case here */
				774
				775	errmsg = "partial character in shift sequence";
				776	goto utf7Error;
				777	}
				778	/* According to RFC2152 the remaining bits should be zero. We
				779	choose to signal an error/insert a replacement character
				780	here so indicate the potential of a misencoded character. */
				781
				782	/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
				783	if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
				784	errmsg = "non-zero padding bits in shift sequence";
				785	goto utf7Error;
				786	}
				787
				788	if (ch == '-') {
				789	if ((s < e) && (*(s) == '-')) {
				790	*p++ = '-';
				791	inShift = 1;
				792	}
				793	} else if (SPECIAL(ch,0,0)) {
				794	errmsg = "unexpected special character";
				795	goto utf7Error;
				796	} else {
				797	*p++ = ch;
				798	}
				799	} else {
				800	charsleft = (charsleft << 6) \| UB64(ch);
				801	bitsleft += 6;
				802	s++;
				803	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				804	}
				805	}
				806	else if ( ch == '+' ) {
				807	s++;
				808	if (s < e && *s == '-') {
				809	s++;
				810	*p++ = '+';
				811	} else
				812	{
				813	inShift = 1;
				814	bitsleft = 0;
				815	}
				816	}
				817	else if (SPECIAL(ch,0,0)) {
				818	errmsg = "unexpected special character";
				819	s++;
				820	goto utf7Error;
				821	}
				822	else {
				823	*p++ = ch;
				824	s++;
				825	}
				826	continue;
				827	utf7Error:
				828	if (utf7_decoding_error(&p, errors, errmsg))
				829	goto onError;
				830	}
				831
				832	if (inShift) {
				833	if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
				834	goto onError;
				835	}
				836
				837	if (_PyUnicode_Resize(&unicode, p - unicode->str))
				838	goto onError;
				839
				840	return (PyObject *)unicode;
				841
				842	onError:
				843	Py_DECREF(unicode);
				844	return NULL;
				845	}
				846
				847
				848	PyObject PyUnicode_EncodeUTF7(const Py_UNICODE s,
				849	int size,
				850	int encodeSetO,
				851	int encodeWhiteSpace,
				852	const char *errors)
				853	{
				854	PyObject *v;
				855	/* It might be possible to tighten this worst case */
				856	unsigned int cbAllocated = 5 * size;
				857	int inShift = 0;
				858	int i = 0;
				859	unsigned int bitsleft = 0;
				860	unsigned long charsleft = 0;
				861	char * out;
				862	char * start;
				863
				864	if (size == 0)
				865	return PyString_FromStringAndSize(NULL, 0);
				866
				867	v = PyString_FromStringAndSize(NULL, cbAllocated);
				868	if (v == NULL)
				869	return NULL;
				870
				871	start = out = PyString_AS_STRING(v);
				872	for (;i < size; ++i) {
				873	Py_UNICODE ch = s[i];
				874
				875	if (!inShift) {
				876	if (ch == '+') {
				877	*out++ = '+';
				878	*out++ = '-';
				879	} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				880	charsleft = ch;
				881	bitsleft = 16;
				882	*out++ = '+';
				883	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				884	inShift = bitsleft > 0;
				885	} else {
				886	*out++ = (char) ch;
				887	}
				888	} else {
				889	if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				890	*out++ = B64(charsleft << (6-bitsleft));
				891	charsleft = 0;
				892	bitsleft = 0;
				893	/* Characters not in the BASE64 set implicitly unshift the sequence
				894	so no '-' is required, except if the character is itself a '-' */
				895	if (B64CHAR(ch) \|\| ch == '-') {
				896	*out++ = '-';
				897	}
				898	inShift = 0;
				899	*out++ = (char) ch;
				900	} else {
				901	bitsleft += 16;
				902	charsleft = (charsleft << 16) \| ch;
				903	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				904
				905	/* If the next character is special then we dont' need to terminate
				906	the shift sequence. If the next character is not a BASE64 character
				907	or '-' then the shift sequence will be terminated implicitly and we
				908	don't have to insert a '-'. */
				909
				910	if (bitsleft == 0) {
				911	if (i + 1 < size) {
				912	Py_UNICODE ch2 = s[i+1];
				913
				914	if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
				915
				916	} else if (B64CHAR(ch2) \|\| ch2 == '-') {
				917	*out++ = '-';
				918	inShift = 0;
				919	} else {
				920	inShift = 0;
				921	}
				922
				923	}
				924	else {
				925	*out++ = '-';
				926	inShift = 0;
				927	}
				928	}
				929	}
				930	}
				931	}
				932	if (bitsleft) {
				933	*out++= B64(charsleft << (6-bitsleft) );
				934	*out++ = '-';
				935	}
				936
				937	if (_PyString_Resize(&v, out - start)) {
				938	Py_DECREF(v);
				939	return NULL;
				940	}
				941	return v;
				942	}
				943
				944	#undef SPECIAL
				945	#undef B64
				946	#undef B64CHAR
				947	#undef UB64
				948	#undef ENCODE
				949	#undef DECODE
				950
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	951	/* --- UTF-8 Codec -------------------------------------------------------- */
				952
				953	static
				954	char utf8_code_length[256] = {
				955	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				956	illegal prefix. see RFC 2279 for details */
				957	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				958	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				959	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				960	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				961	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				962	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				963	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				964	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				965	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				966	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				967	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				968	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				969	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				970	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				971	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				972	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				973	};
				974
				975	static
				976	int utf8_decoding_error(const char **source,
				977	Py_UNICODE **dest,
				978	const char *errors,
				979	const char *details)
				980	{
				981	if ((errors == NULL) \|\|
				982	(strcmp(errors,"strict") == 0)) {
				983	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	984	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	985	details);
				986	return -1;
				987	}
				988	else if (strcmp(errors,"ignore") == 0) {
				989	(*source)++;
				990	return 0;
				991	}
				992	else if (strcmp(errors,"replace") == 0) {
				993	(*source)++;
				994	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				995	(*dest)++;
				996	return 0;
				997	}
				998	else {
				999	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1000	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1001	errors);
				1002	return -1;
				1003	}
				1004	}
				1005
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1006	PyObject PyUnicode_DecodeUTF8(const char s,
				1007	int size,
				1008	const char *errors)
				1009	{
				1010	int n;
				1011	const char *e;
				1012	PyUnicodeObject *unicode;
				1013	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1014	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1015
				1016	/* Note: size will always be longer than the resulting Unicode
				1017	character count */
				1018	unicode = _PyUnicode_New(size);
				1019	if (!unicode)
				1020	return NULL;
				1021	if (size == 0)
				1022	return (PyObject *)unicode;
				1023
				1024	/* Unpack UTF-8 encoded data */
				1025	p = unicode->str;
				1026	e = s + size;
				1027
				1028	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1029	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1030
				1031	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1032	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1033	s++;
				1034	continue;
				1035	}
				1036
				1037	n = utf8_code_length[ch];
				1038
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1039	if (s + n > e) {
				1040	errmsg = "unexpected end of data";
				1041	goto utf8Error;
				1042	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1043
				1044	switch (n) {
				1045
				1046	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1047	errmsg = "unexpected code byte";
				1048	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1049
				1050	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1051	errmsg = "internal error";
				1052	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1053
				1054	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1055	if ((s[1] & 0xc0) != 0x80) {
				1056	errmsg = "invalid data";
				1057	goto utf8Error;
				1058	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1059	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1060	if (ch < 0x80) {
				1061	errmsg = "illegal encoding";
				1062	goto utf8Error;
				1063	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1064	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1065	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1066	break;
				1067
				1068	case 3:
				1069	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1070	(s[2] & 0xc0) != 0x80) {
				1071	errmsg = "invalid data";
				1072	goto utf8Error;
				1073	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1074	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1075	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				1076	errmsg = "illegal encoding";
				1077	goto utf8Error;
				1078	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1079	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1080	*p++ = (Py_UNICODE)ch;
				1081	break;
				1082
				1083	case 4:
				1084	if ((s[1] & 0xc0) != 0x80 \|\|
				1085	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1086	(s[3] & 0xc0) != 0x80) {
				1087	errmsg = "invalid data";
				1088	goto utf8Error;
				1089	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1090	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				1091	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				1092	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1093	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1094	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1095	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1096	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1097	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1098	errmsg = "illegal encoding";
				1099	goto utf8Error;
				1100	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1101	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1102	*p++ = (Py_UNICODE)ch;
				1103	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1104	/* compute and append the two surrogates: */
				1105
				1106	/* translate from 10000..10FFFF to 0..FFFF */
				1107	ch -= 0x10000;
				1108
				1109	/* high surrogate = top 10 bits added to D800 */
				1110	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				1111
				1112	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1113	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1114	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1115	break;
				1116
				1117	default:
				1118	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1119	errmsg = "unsupported Unicode code range";
				1120	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1121	}
				1122	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1123	continue;
				1124
				1125	utf8Error:
				1126	if (utf8_decoding_error(&s, &p, errors, errmsg))
				1127	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1128	}
				1129
				1130	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1131	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1132	goto onError;
				1133
				1134	return (PyObject *)unicode;
				1135
				1136	onError:
				1137	Py_DECREF(unicode);
				1138	return NULL;
				1139	}
				1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding problem described by
   `details`: raise for NULL/"strict", do nothing for "ignore", emit '?'
   through *dest for "replace", raise ValueError for anything else. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") != 0) {
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }

    **dest = '?';
    *dest += 1;
    return 0;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1174
				1175	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				1176	int size,
				1177	const char *errors)
				1178	{
				1179	PyObject *v;
				1180	char *p;
				1181	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1182	Py_UCS4 ch2;
				1183	unsigned int cbAllocated = 3 * size;
				1184	unsigned int cbWritten = 0;
				1185	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1186
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1187	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1188	if (v == NULL)
				1189	return NULL;
				1190	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1191	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1192
				1193	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1194	while (i < size) {
				1195	Py_UCS4 ch = s[i++];
				1196	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1197	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1198	cbWritten++;
				1199	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1200	else if (ch < 0x0800) {
				1201	*p++ = 0xc0 \| (ch >> 6);
				1202	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1203	cbWritten += 2;
				1204	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1205	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1206	/* Check for high surrogate */
				1207	if (0xD800 <= ch && ch <= 0xDBFF) {
				1208	if (i != size) {
				1209	ch2 = s[i];
				1210	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				1211
				1212	if (cbWritten >= (cbAllocated - 4)) {
				1213	/* Provide enough room for some more
				1214	surrogates */
				1215	cbAllocated += 4*10;
				1216	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1217	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1218	}
				1219
				1220	/* combine the two values */
				1221	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				1222
				1223	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	1224	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1225	i++;
				1226	cbWritten += 4;
				1227	}
				1228	}
				1229	}
				1230	else {
				1231	*p++ = (char)(0xe0 \| (ch >> 12));
				1232	cbWritten += 3;
				1233	}
				1234	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1235	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1236	} else {
				1237	*p++ = 0xf0 \| (ch>>18);
				1238	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				1239	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				1240	*p++ = 0x80 \| (ch & 0x3f);
				1241	cbWritten += 4;
				1242	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1243	}
				1244	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1245	if (_PyString_Resize(&v, p - q))
				1246	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1247	return v;
				1248
				1249	onError:
				1250	Py_DECREF(v);
				1251	return NULL;
				1252	}
				1253
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1254	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				1255	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1256	if (!PyUnicode_Check(unicode)) {
				1257	PyErr_BadArgument();
				1258	return NULL;
				1259	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	1260	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				1261	PyUnicode_GET_SIZE(unicode),
				1262	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1263	}
				1264
				1265	/* --- UTF-16 Codec ------------------------------------------------------- */
				1266
				1267	static
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1268	int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1269	const char *errors,
				1270	const char *details)
				1271	{
				1272	if ((errors == NULL) \|\|
				1273	(strcmp(errors,"strict") == 0)) {
				1274	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1275	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1276	details);
				1277	return -1;
				1278	}
				1279	else if (strcmp(errors,"ignore") == 0) {
				1280	return 0;
				1281	}
				1282	else if (strcmp(errors,"replace") == 0) {
				1283	if (dest) {
				1284	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1285	(*dest)++;
				1286	}
				1287	return 0;
				1288	}
				1289	else {
				1290	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	1291	"UTF-16 decoding error; "
				1292	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1293	errors);
				1294	return -1;
				1295	}
				1296	}
				1297
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1298	PyObject *
				1299	PyUnicode_DecodeUTF16(const char *s,
				1300	int size,
				1301	const char *errors,
				1302	int *byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1303	{
				1304	PyUnicodeObject *unicode;
				1305	Py_UNICODE *p;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1306	const unsigned char q, e;
				1307	int bo = 0; /* assume native ordering by default */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1308	const char *errmsg = "";
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1309	/* Offsets from q for retrieving byte pairs in the right order. */
				1310	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1311	int ihi = 1, ilo = 0;
				1312	#else
				1313	int ihi = 0, ilo = 1;
				1314	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1315
				1316	/* size should be an even number */
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1317	if (size & 1) {
				1318	if (utf16_decoding_error(NULL, errors, "truncated data"))
				1319	return NULL;
				1320	--size; /* else ignore the oddball byte */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1321	}
				1322
				1323	/* Note: size will always be longer than the resulting Unicode
				1324	character count */
				1325	unicode = _PyUnicode_New(size);
				1326	if (!unicode)
				1327	return NULL;
				1328	if (size == 0)
				1329	return (PyObject *)unicode;
				1330
				1331	/* Unpack UTF-16 encoded data */
				1332	p = unicode->str;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1333	q = (unsigned char *)s;
				1334	e = q + size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1335
				1336	if (byteorder)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1337	bo = *byteorder;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1338
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1339	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1340	byte order setting accordingly. In native mode, the leading BOM
				1341	mark is skipped, in all other modes, it is copied to the output
				1342	stream as-is (giving a ZWNBSP character). */
				1343	if (bo == 0) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1344	const Py_UNICODE bom = (q[ihi] << 8) \| q[ilo];
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1345	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1346	if (bom == 0xFEFF) {
				1347	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1348	bo = -1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1349	}
				1350	else if (bom == 0xFFFE) {
				1351	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1352	bo = 1;
				1353	}
				1354	#else
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1355	if (bom == 0xFEFF) {
				1356	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1357	bo = 1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1358	}
				1359	else if (bom == 0xFFFE) {
				1360	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1361	bo = -1;
				1362	}
				1363	#endif
				1364	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1365
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1366	if (bo == -1) {
				1367	/* force LE */
				1368	ihi = 1;
				1369	ilo = 0;
				1370	}
				1371	else if (bo == 1) {
				1372	/* force BE */
				1373	ihi = 0;
				1374	ilo = 1;
				1375	}
				1376
				1377	while (q < e) {
				1378	Py_UNICODE ch = (q[ihi] << 8) \| q[ilo];
				1379	q += 2;
				1380
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1381	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1382	*p++ = ch;
				1383	continue;
				1384	}
				1385
				1386	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1387	if (q >= e) {
				1388	errmsg = "unexpected end of data";
				1389	goto utf16Error;
				1390	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1391	if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1392	Py_UNICODE ch2 = (q[ihi] << 8) \| q[ilo];
				1393	q += 2;
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1394	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1395	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1396	*p++ = ch;
				1397	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1398	#else
				1399	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1400	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1401	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1402	}
				1403	else {
				1404	errmsg = "illegal UTF-16 surrogate";
				1405	goto utf16Error;
				1406	}
				1407
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1408	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1409	errmsg = "illegal encoding";
				1410	/* Fall through to report the error */
				1411
				1412	utf16Error:
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1413	if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1414	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1415	}
				1416
				1417	if (byteorder)
				1418	*byteorder = bo;
				1419
				1420	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1421	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1422	goto onError;
				1423
				1424	return (PyObject *)unicode;
				1425
				1426	onError:
				1427	Py_DECREF(unicode);
				1428	return NULL;
				1429	}
				1430
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1431	PyObject *
				1432	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
				1433	int size,
				1434	const char *errors,
				1435	int byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1436	{
				1437	PyObject *v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1438	unsigned char *p;
				1439	int i, pairs;
				1440	/* Offsets from p for storing byte pairs in the right order. */
				1441	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1442	int ihi = 1, ilo = 0;
				1443	#else
				1444	int ihi = 0, ilo = 1;
				1445	#endif
				1446
				1447	#define STORECHAR(CH) \
				1448	do { \
				1449	p[ihi] = ((CH) >> 8) & 0xff; \
				1450	p[ilo] = (CH) & 0xff; \
				1451	p += 2; \
				1452	} while(0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1453
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1454	for (i = pairs = 0; i < size; i++)
				1455	if (s[i] >= 0x10000)
				1456	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1457	v = PyString_FromStringAndSize(NULL,
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1458	2 * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1459	if (v == NULL)
				1460	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1461
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1462	p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1463	if (byteorder == 0)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1464	STORECHAR(0xFEFF);
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1465	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1466	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1467
				1468	if (byteorder == -1) {
				1469	/* force LE */
				1470	ihi = 1;
				1471	ilo = 0;
				1472	}
				1473	else if (byteorder == 1) {
				1474	/* force BE */
				1475	ihi = 0;
				1476	ilo = 1;
				1477	}
				1478
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1479	while (size-- > 0) {
				1480	Py_UNICODE ch = *s++;
				1481	Py_UNICODE ch2 = 0;
				1482	if (ch >= 0x10000) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1483	ch2 = 0xDC00 \| ((ch-0x10000) & 0x3FF);
				1484	ch = 0xD800 \| ((ch-0x10000) >> 10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1485	}
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1486	STORECHAR(ch);
				1487	if (ch2)
				1488	STORECHAR(ch2);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1489	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1490	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1491	#undef STORECHAR
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1492	}
				1493
				1494	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1495	{
				1496	if (!PyUnicode_Check(unicode)) {
				1497	PyErr_BadArgument();
				1498	return NULL;
				1499	}
				1500	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1501	PyUnicode_GET_SIZE(unicode),
				1502	NULL,
				1503	0);
				1504	}
				1505
				1506	/* --- Unicode Escape Codec ----------------------------------------------- */
				1507
				1508	static
				1509	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1510	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1511	const char *errors,
				1512	const char *details)
				1513	{
				1514	if ((errors == NULL) \|\|
				1515	(strcmp(errors,"strict") == 0)) {
				1516	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1517	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1518	details);
				1519	return -1;
				1520	}
				1521	else if (strcmp(errors,"ignore") == 0) {
				1522	return 0;
				1523	}
				1524	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1525	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1526	return 0;
				1527	}
				1528	else {
				1529	PyErr_Format(PyExc_ValueError,
				1530	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1531	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1532	errors);
				1533	return -1;
				1534	}
				1535	}
				1536
/* File-scope pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   that follows -- confirm against the code that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1538
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1539	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1540	int size,
				1541	const char *errors)
				1542	{
				1543	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1544	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1545	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1546	char* message;
				1547	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1548
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1549	/* Escaped strings will always be longer than the resulting
				1550	Unicode string, so we start with size here and then reduce the
				1551	length after conversion to the true value. */
				1552	v = _PyUnicode_New(size);
				1553	if (v == NULL)
				1554	goto onError;
				1555	if (size == 0)
				1556	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1557
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1558	p = buf = PyUnicode_AS_UNICODE(v);
				1559	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1560
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1561	while (s < end) {
				1562	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1563	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1564	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1565
				1566	/* Non-escape characters are interpreted as Unicode ordinals */
				1567	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1568	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1569	continue;
				1570	}
				1571
				1572	/* \ - Escapes */
				1573	s++;
				1574	switch (*s++) {
				1575
				1576	/* \x escapes */
				1577	case '\n': break;
				1578	case '\\': *p++ = '\\'; break;
				1579	case '\'': *p++ = '\''; break;
				1580	case '\"': *p++ = '\"'; break;
				1581	case 'b': *p++ = '\b'; break;
				1582	case 'f': p++ = '\014'; break; / FF */
				1583	case 't': *p++ = '\t'; break;
				1584	case 'n': *p++ = '\n'; break;
				1585	case 'r': *p++ = '\r'; break;
				1586	case 'v': p++ = '\013'; break; / VT */
				1587	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1588
				1589	/* \OOO (octal) escapes */
				1590	case '0': case '1': case '2': case '3':
				1591	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1592	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1593	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1594	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1595	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1596	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1597	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1598	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1599	break;
				1600
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1601	/* hex escapes */
				1602	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1603	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1604	digits = 2;
				1605	message = "truncated \\xXX escape";
				1606	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1607
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1608	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1609	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1610	digits = 4;
				1611	message = "truncated \\uXXXX escape";
				1612	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1613
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1614	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1615	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1616	digits = 8;
				1617	message = "truncated \\UXXXXXXXX escape";
				1618	hexescape:
				1619	chr = 0;
				1620	for (i = 0; i < digits; i++) {
				1621	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1622	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1623	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1624	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1625	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1626	i++;
				1627	break;
				1628	}
				1629	chr = (chr<<4) & ~0xF;
				1630	if (c >= '0' && c <= '9')
				1631	chr += c - '0';
				1632	else if (c >= 'a' && c <= 'f')
				1633	chr += 10 + c - 'a';
				1634	else
				1635	chr += 10 + c - 'A';
				1636	}
				1637	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1638	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1639	/* when we get here, chr is a 32-bit unicode character */
				1640	if (chr <= 0xffff)
				1641	/* UCS-2 character */
				1642	*p++ = (Py_UNICODE) chr;
				1643	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1644	/* UCS-4 character. Either store directly, or as
				1645	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1646	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1647	*p++ = chr;
				1648	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1649	chr -= 0x10000L;
				1650	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1651	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1652	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1653	} else {
				1654	if (unicodeescape_decoding_error(
				1655	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1656	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1657	)
				1658	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1659	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1660	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1661	break;
				1662
				1663	/* \N{name} */
				1664	case 'N':
				1665	message = "malformed \\N character escape";
				1666	if (ucnhash_CAPI == NULL) {
				1667	/* load the unicode data module */
				1668	PyObject m, v;
				1669	m = PyImport_ImportModule("unicodedata");
				1670	if (m == NULL)
				1671	goto ucnhashError;
				1672	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1673	Py_DECREF(m);
				1674	if (v == NULL)
				1675	goto ucnhashError;
				1676	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1677	Py_DECREF(v);
				1678	if (ucnhash_CAPI == NULL)
				1679	goto ucnhashError;
				1680	}
				1681	if (*s == '{') {
				1682	const char *start = s+1;
				1683	/* look for the closing brace */
				1684	while (*s != '}' && s < end)
				1685	s++;
				1686	if (s > start && s < end && *s == '}') {
				1687	/* found a name. look it up in the unicode database */
				1688	message = "unknown Unicode character name";
				1689	s++;
				1690	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1691	goto store;
				1692	}
				1693	}
				1694	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1695	goto onError;
				1696	*p++ = x;
				1697	break;
				1698
				1699	default:
				1700	*p++ = '\\';
				1701	*p++ = (unsigned char)s[-1];
				1702	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1703	}
				1704	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1705	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1706	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1707	return (PyObject *)v;
				1708
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1709	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1710	PyErr_SetString(
				1711	PyExc_UnicodeError,
				1712	"\\N escapes not supported (can't load unicodedata module)"
				1713	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1714	return NULL;
				1715
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1716	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1717	Py_XDECREF(v);
				1718	return NULL;
				1719	}
				1720
				1721	/* Return a Unicode-Escape string version of the Unicode object.
				1722
				1723	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1724	appropriate.
				1725
				1726	*/
				1727
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1728	static const Py_UNICODE findchar(const Py_UNICODE s,
				1729	int size,
				1730	Py_UNICODE ch);
				1731
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1732	static
				1733	PyObject unicodeescape_string(const Py_UNICODE s,
				1734	int size,
				1735	int quotes)
				1736	{
				1737	PyObject *repr;
				1738	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1739
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1740	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1741
				1742	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1743	if (repr == NULL)
				1744	return NULL;
				1745
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1746	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1747
				1748	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1749	*p++ = 'u';
				1750	*p++ = (findchar(s, size, '\'') &&
				1751	!findchar(s, size, '"')) ? '"' : '\'';
				1752	}
				1753	while (size-- > 0) {
				1754	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1755
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1756	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1757	if (quotes &&
				1758	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1759	*p++ = '\\';
				1760	*p++ = (char) ch;
				1761	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1762
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1763	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1764	/* Map 21-bit characters to '\U00xxxxxx' */
				1765	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1766	int offset = p - PyString_AS_STRING(repr);
				1767
				1768	/* Resize the string if necessary */
				1769	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1770	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1771	goto onError;
				1772	p = PyString_AS_STRING(repr) + offset;
				1773	}
				1774
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1775	*p++ = '\\';
				1776	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1777	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1778	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1779	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1780	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1781	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1782	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1783	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1784	*p++ = hexdigit[ch & 0x0000000F];
				1785	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1786	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1787	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1788	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1789	else if (ch >= 0xD800 && ch < 0xDC00) {
				1790	Py_UNICODE ch2;
				1791	Py_UCS4 ucs;
				1792
				1793	ch2 = *s++;
				1794	size--;
				1795	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1796	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1797	*p++ = '\\';
				1798	*p++ = 'U';
				1799	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1800	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1801	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1802	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1803	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1804	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1805	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1806	*p++ = hexdigit[ucs & 0x0000000F];
				1807	continue;
				1808	}
				1809	/* Fall through: isolated surrogates are copied as-is */
				1810	s--;
				1811	size++;
				1812	}
				1813
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1814	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1815	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1816	*p++ = '\\';
				1817	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1818	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1819	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1820	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1821	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1822	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1823
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1824	/* Map special whitespace to '\t', \n', '\r' */
				1825	else if (ch == '\t') {
				1826	*p++ = '\\';
				1827	*p++ = 't';
				1828	}
				1829	else if (ch == '\n') {
				1830	*p++ = '\\';
				1831	*p++ = 'n';
				1832	}
				1833	else if (ch == '\r') {
				1834	*p++ = '\\';
				1835	*p++ = 'r';
				1836	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1837
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1838	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1839	else if (ch < ' ' \|\| ch >= 128) {
				1840	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1841	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1842	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1843	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1844	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1845
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1846	/* Copy everything else as-is */
				1847	else
				1848	*p++ = (char) ch;
				1849	}
				1850	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1851	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1852
				1853	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1854	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1855	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1856
				1857	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1858
				1859	onError:
				1860	Py_DECREF(repr);
				1861	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1862	}
				1863
				1864	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1865	int size)
				1866	{
				1867	return unicodeescape_string(s, size, 0);
				1868	}
				1869
				1870	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1871	{
				1872	if (!PyUnicode_Check(unicode)) {
				1873	PyErr_BadArgument();
				1874	return NULL;
				1875	}
				1876	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1877	PyUnicode_GET_SIZE(unicode));
				1878	}
				1879
				1880	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1881
				1882	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1883	int size,
				1884	const char *errors)
				1885	{
				1886	PyUnicodeObject *v;
				1887	Py_UNICODE p, buf;
				1888	const char *end;
				1889	const char *bs;
				1890
				1891	/* Escaped strings will always be longer than the resulting
				1892	Unicode string, so we start with size here and then reduce the
				1893	length after conversion to the true value. */
				1894	v = _PyUnicode_New(size);
				1895	if (v == NULL)
				1896	goto onError;
				1897	if (size == 0)
				1898	return (PyObject *)v;
				1899	p = buf = PyUnicode_AS_UNICODE(v);
				1900	end = s + size;
				1901	while (s < end) {
				1902	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1903	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1904	int i;
				1905
				1906	/* Non-escape characters are interpreted as Unicode ordinals */
				1907	if (*s != '\\') {
				1908	p++ = (unsigned char)s++;
				1909	continue;
				1910	}
				1911
				1912	/* \u-escapes are only interpreted iff the number of leading
				1913	backslashes if odd */
				1914	bs = s;
				1915	for (;s < end;) {
				1916	if (*s != '\\')
				1917	break;
				1918	p++ = (unsigned char)s++;
				1919	}
				1920	if (((s - bs) & 1) == 0 \|\|
				1921	s >= end \|\|
				1922	*s != 'u') {
				1923	continue;
				1924	}
				1925	p--;
				1926	s++;
				1927
				1928	/* \uXXXX with 4 hex digits */
				1929	for (x = 0, i = 0; i < 4; i++) {
				1930	c = (unsigned char)s[i];
				1931	if (!isxdigit(c)) {
				1932	if (unicodeescape_decoding_error(&s, &x, errors,
				1933	"truncated \\uXXXX"))
				1934	goto onError;
				1935	i++;
				1936	break;
				1937	}
				1938	x = (x<<4) & ~0xF;
				1939	if (c >= '0' && c <= '9')
				1940	x += c - '0';
				1941	else if (c >= 'a' && c <= 'f')
				1942	x += 10 + c - 'a';
				1943	else
				1944	x += 10 + c - 'A';
				1945	}
				1946	s += i;
				1947	*p++ = x;
				1948	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1949	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1950	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1951	return (PyObject *)v;
				1952
				1953	onError:
				1954	Py_XDECREF(v);
				1955	return NULL;
				1956	}
				1957
				1958	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1959	int size)
				1960	{
				1961	PyObject *repr;
				1962	char *p;
				1963	char *q;
				1964
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1965	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1966
				1967	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1968	if (repr == NULL)
				1969	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1970	if (size == 0)
				1971	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1972
				1973	p = q = PyString_AS_STRING(repr);
				1974	while (size-- > 0) {
				1975	Py_UNICODE ch = *s++;
				1976	/* Map 16-bit characters to '\uxxxx' */
				1977	if (ch >= 256) {
				1978	*p++ = '\\';
				1979	*p++ = 'u';
				1980	*p++ = hexdigit[(ch >> 12) & 0xf];
				1981	*p++ = hexdigit[(ch >> 8) & 0xf];
				1982	*p++ = hexdigit[(ch >> 4) & 0xf];
				1983	*p++ = hexdigit[ch & 15];
				1984	}
				1985	/* Copy everything else as-is */
				1986	else
				1987	*p++ = (char) ch;
				1988	}
				1989	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1990	if (_PyString_Resize(&repr, p - q))
				1991	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1992
				1993	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1994
				1995	onError:
				1996	Py_DECREF(repr);
				1997	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1998	}
				1999
				2000	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				2001	{
				2002	if (!PyUnicode_Check(unicode)) {
				2003	PyErr_BadArgument();
				2004	return NULL;
				2005	}
				2006	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				2007	PyUnicode_GET_SIZE(unicode));
				2008	}
				2009
				2010	/* --- Latin-1 Codec ------------------------------------------------------ */
				2011
				2012	PyObject PyUnicode_DecodeLatin1(const char s,
				2013	int size,
				2014	const char *errors)
				2015	{
				2016	PyUnicodeObject *v;
				2017	Py_UNICODE *p;
				2018
				2019	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2020	if (size == 1 && (unsigned char)s < 256) {
				2021	Py_UNICODE r = (unsigned char)s;
				2022	return PyUnicode_FromUnicode(&r, 1);
				2023	}
				2024
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2025	v = _PyUnicode_New(size);
				2026	if (v == NULL)
				2027	goto onError;
				2028	if (size == 0)
				2029	return (PyObject *)v;
				2030	p = PyUnicode_AS_UNICODE(v);
				2031	while (size-- > 0)
				2032	p++ = (unsigned char)s++;
				2033	return (PyObject *)v;
				2034
				2035	onError:
				2036	Py_XDECREF(v);
				2037	return NULL;
				2038	}
				2039
				2040	static
				2041	int latin1_encoding_error(const Py_UNICODE **source,
				2042	char **dest,
				2043	const char *errors,
				2044	const char *details)
				2045	{
				2046	if ((errors == NULL) \|\|
				2047	(strcmp(errors,"strict") == 0)) {
				2048	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2049	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2050	details);
				2051	return -1;
				2052	}
				2053	else if (strcmp(errors,"ignore") == 0) {
				2054	return 0;
				2055	}
				2056	else if (strcmp(errors,"replace") == 0) {
				2057	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2058	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2059	return 0;
				2060	}
				2061	else {
				2062	PyErr_Format(PyExc_ValueError,
				2063	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2064	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2065	errors);
				2066	return -1;
				2067	}
				2068	}
				2069
				2070	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				2071	int size,
				2072	const char *errors)
				2073	{
				2074	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2075	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2076
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2077	repr = PyString_FromStringAndSize(NULL, size);
				2078	if (repr == NULL)
				2079	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2080	if (size == 0)
				2081	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2082
				2083	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2084	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2085	while (size-- > 0) {
				2086	Py_UNICODE ch = *p++;
				2087	if (ch >= 256) {
				2088	if (latin1_encoding_error(&p, &s, errors,
				2089	"ordinal not in range(256)"))
				2090	goto onError;
				2091	}
				2092	else
				2093	*s++ = (char)ch;
				2094	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2095	/* Resize if error handling skipped some characters */
				2096	if (s - start < PyString_GET_SIZE(repr))
				2097	if (_PyString_Resize(&repr, s - start))
				2098	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2099	return repr;
				2100
				2101	onError:
				2102	Py_DECREF(repr);
				2103	return NULL;
				2104	}
				2105
				2106	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				2107	{
				2108	if (!PyUnicode_Check(unicode)) {
				2109	PyErr_BadArgument();
				2110	return NULL;
				2111	}
				2112	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				2113	PyUnicode_GET_SIZE(unicode),
				2114	NULL);
				2115	}
				2116
				2117	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				2118
				2119	static
				2120	int ascii_decoding_error(const char **source,
				2121	Py_UNICODE **dest,
				2122	const char *errors,
				2123	const char *details)
				2124	{
				2125	if ((errors == NULL) \|\|
				2126	(strcmp(errors,"strict") == 0)) {
				2127	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2128	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2129	details);
				2130	return -1;
				2131	}
				2132	else if (strcmp(errors,"ignore") == 0) {
				2133	return 0;
				2134	}
				2135	else if (strcmp(errors,"replace") == 0) {
				2136	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2137	(*dest)++;
				2138	return 0;
				2139	}
				2140	else {
				2141	PyErr_Format(PyExc_ValueError,
				2142	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2143	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2144	errors);
				2145	return -1;
				2146	}
				2147	}
				2148
				2149	PyObject PyUnicode_DecodeASCII(const char s,
				2150	int size,
				2151	const char *errors)
				2152	{
				2153	PyUnicodeObject *v;
				2154	Py_UNICODE *p;
				2155
				2156	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2157	if (size == 1 && (unsigned char)s < 128) {
				2158	Py_UNICODE r = (unsigned char)s;
				2159	return PyUnicode_FromUnicode(&r, 1);
				2160	}
				2161
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2162	v = _PyUnicode_New(size);
				2163	if (v == NULL)
				2164	goto onError;
				2165	if (size == 0)
				2166	return (PyObject *)v;
				2167	p = PyUnicode_AS_UNICODE(v);
				2168	while (size-- > 0) {
				2169	register unsigned char c;
				2170
				2171	c = (unsigned char)*s++;
				2172	if (c < 128)
				2173	*p++ = c;
				2174	else if (ascii_decoding_error(&s, &p, errors,
				2175	"ordinal not in range(128)"))
				2176	goto onError;
				2177	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2178	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2179	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2180	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2181	return (PyObject *)v;
				2182
				2183	onError:
				2184	Py_XDECREF(v);
				2185	return NULL;
				2186	}
				2187
				2188	static
				2189	int ascii_encoding_error(const Py_UNICODE **source,
				2190	char **dest,
				2191	const char *errors,
				2192	const char *details)
				2193	{
				2194	if ((errors == NULL) \|\|
				2195	(strcmp(errors,"strict") == 0)) {
				2196	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2197	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2198	details);
				2199	return -1;
				2200	}
				2201	else if (strcmp(errors,"ignore") == 0) {
				2202	return 0;
				2203	}
				2204	else if (strcmp(errors,"replace") == 0) {
				2205	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2206	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2207	return 0;
				2208	}
				2209	else {
				2210	PyErr_Format(PyExc_ValueError,
				2211	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2212	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2213	errors);
				2214	return -1;
				2215	}
				2216	}
				2217
				2218	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				2219	int size,
				2220	const char *errors)
				2221	{
				2222	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2223	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2224
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2225	repr = PyString_FromStringAndSize(NULL, size);
				2226	if (repr == NULL)
				2227	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2228	if (size == 0)
				2229	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2230
				2231	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2232	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2233	while (size-- > 0) {
				2234	Py_UNICODE ch = *p++;
				2235	if (ch >= 128) {
				2236	if (ascii_encoding_error(&p, &s, errors,
				2237	"ordinal not in range(128)"))
				2238	goto onError;
				2239	}
				2240	else
				2241	*s++ = (char)ch;
				2242	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2243	/* Resize if error handling skipped some characters */
				2244	if (s - start < PyString_GET_SIZE(repr))
				2245	if (_PyString_Resize(&repr, s - start))
				2246	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2247	return repr;
				2248
				2249	onError:
				2250	Py_DECREF(repr);
				2251	return NULL;
				2252	}
				2253
				2254	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				2255	{
				2256	if (!PyUnicode_Check(unicode)) {
				2257	PyErr_BadArgument();
				2258	return NULL;
				2259	}
				2260	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				2261	PyUnicode_GET_SIZE(unicode),
				2262	NULL);
				2263	}
				2264
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	2265	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2266
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2267	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2268
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2269	PyObject PyUnicode_DecodeMBCS(const char s,
				2270	int size,
				2271	const char *errors)
				2272	{
				2273	PyUnicodeObject *v;
				2274	Py_UNICODE *p;
				2275
				2276	/* First get the size of the result */
				2277	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2278	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2279	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2280
				2281	v = _PyUnicode_New(usize);
				2282	if (v == NULL)
				2283	return NULL;
				2284	if (usize == 0)
				2285	return (PyObject *)v;
				2286	p = PyUnicode_AS_UNICODE(v);
				2287	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				2288	Py_DECREF(v);
				2289	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2290	}
				2291
				2292	return (PyObject *)v;
				2293	}
				2294
				2295	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				2296	int size,
				2297	const char *errors)
				2298	{
				2299	PyObject *repr;
				2300	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2301	DWORD mbcssize;
				2302
				2303	/* If there are no characters, bail now! */
				2304	if (size==0)
				2305	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2306
				2307	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2308	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2309	if (mbcssize==0)
				2310	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2311
				2312	repr = PyString_FromStringAndSize(NULL, mbcssize);
				2313	if (repr == NULL)
				2314	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2315	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2316	return repr;
				2317
				2318	/* Do the conversion */
				2319	s = PyString_AS_STRING(repr);
				2320	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				2321	Py_DECREF(repr);
				2322	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2323	}
				2324	return repr;
				2325	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2326
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2327	#endif /* MS_WIN32 */
				2328
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2329	/* --- Character Mapping Codec -------------------------------------------- */
				2330
				2331	static
				2332	int charmap_decoding_error(const char **source,
				2333	Py_UNICODE **dest,
				2334	const char *errors,
				2335	const char *details)
				2336	{
				2337	if ((errors == NULL) \|\|
				2338	(strcmp(errors,"strict") == 0)) {
				2339	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2340	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2341	details);
				2342	return -1;
				2343	}
				2344	else if (strcmp(errors,"ignore") == 0) {
				2345	return 0;
				2346	}
				2347	else if (strcmp(errors,"replace") == 0) {
				2348	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2349	(*dest)++;
				2350	return 0;
				2351	}
				2352	else {
				2353	PyErr_Format(PyExc_ValueError,
				2354	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2355	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2356	errors);
				2357	return -1;
				2358	}
				2359	}
				2360
				2361	PyObject PyUnicode_DecodeCharmap(const char s,
				2362	int size,
				2363	PyObject *mapping,
				2364	const char *errors)
				2365	{
				2366	PyUnicodeObject *v;
				2367	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2368	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2369
				2370	/* Default to Latin-1 */
				2371	if (mapping == NULL)
				2372	return PyUnicode_DecodeLatin1(s, size, errors);
				2373
				2374	v = _PyUnicode_New(size);
				2375	if (v == NULL)
				2376	goto onError;
				2377	if (size == 0)
				2378	return (PyObject *)v;
				2379	p = PyUnicode_AS_UNICODE(v);
				2380	while (size-- > 0) {
				2381	unsigned char ch = *s++;
				2382	PyObject w, x;
				2383
				2384	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2385	w = PyInt_FromLong((long)ch);
				2386	if (w == NULL)
				2387	goto onError;
				2388	x = PyObject_GetItem(mapping, w);
				2389	Py_DECREF(w);
				2390	if (x == NULL) {
				2391	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2392	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2393	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2394	x = Py_None;
				2395	Py_INCREF(x);
				2396	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2397	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2398	}
				2399
				2400	/* Apply mapping */
				2401	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2402	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2403	if (value < 0 \|\| value > 65535) {
				2404	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2405	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2406	Py_DECREF(x);
				2407	goto onError;
				2408	}
				2409	*p++ = (Py_UNICODE)value;
				2410	}
				2411	else if (x == Py_None) {
				2412	/* undefined mapping */
				2413	if (charmap_decoding_error(&s, &p, errors,
				2414	"character maps to <undefined>")) {
				2415	Py_DECREF(x);
				2416	goto onError;
				2417	}
				2418	}
				2419	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2420	int targetsize = PyUnicode_GET_SIZE(x);
				2421
				2422	if (targetsize == 1)
				2423	/* 1-1 mapping */
				2424	p++ = PyUnicode_AS_UNICODE(x);
				2425
				2426	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2427	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2428	if (targetsize > extrachars) {
				2429	/* resize first */
				2430	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2431	int needed = (targetsize - extrachars) + \
				2432	(targetsize << 2);
				2433	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2434	if (_PyUnicode_Resize(&v,
				2435	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2436	Py_DECREF(x);
				2437	goto onError;
				2438	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2439	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2440	}
				2441	Py_UNICODE_COPY(p,
				2442	PyUnicode_AS_UNICODE(x),
				2443	targetsize);
				2444	p += targetsize;
				2445	extrachars -= targetsize;
				2446	}
				2447	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2448	}
				2449	else {
				2450	/* wrong return value */
				2451	PyErr_SetString(PyExc_TypeError,
				2452	"character mapping must return integer, None or unicode");
				2453	Py_DECREF(x);
				2454	goto onError;
				2455	}
				2456	Py_DECREF(x);
				2457	}
				2458	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2459	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2460	goto onError;
				2461	return (PyObject *)v;
				2462
				2463	onError:
				2464	Py_XDECREF(v);
				2465	return NULL;
				2466	}
				2467
				2468	static
				2469	int charmap_encoding_error(const Py_UNICODE **source,
				2470	char **dest,
				2471	const char *errors,
				2472	const char *details)
				2473	{
				2474	if ((errors == NULL) \|\|
				2475	(strcmp(errors,"strict") == 0)) {
				2476	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2477	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2478	details);
				2479	return -1;
				2480	}
				2481	else if (strcmp(errors,"ignore") == 0) {
				2482	return 0;
				2483	}
				2484	else if (strcmp(errors,"replace") == 0) {
				2485	**dest = '?';
				2486	(*dest)++;
				2487	return 0;
				2488	}
				2489	else {
				2490	PyErr_Format(PyExc_ValueError,
				2491	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2492	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2493	errors);
				2494	return -1;
				2495	}
				2496	}
				2497
				2498	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2499	int size,
				2500	PyObject *mapping,
				2501	const char *errors)
				2502	{
				2503	PyObject *v;
				2504	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2505	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2506
				2507	/* Default to Latin-1 */
				2508	if (mapping == NULL)
				2509	return PyUnicode_EncodeLatin1(p, size, errors);
				2510
				2511	v = PyString_FromStringAndSize(NULL, size);
				2512	if (v == NULL)
				2513	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2514	if (size == 0)
				2515	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2516	s = PyString_AS_STRING(v);
				2517	while (size-- > 0) {
				2518	Py_UNICODE ch = *p++;
				2519	PyObject w, x;
				2520
				2521	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2522	w = PyInt_FromLong((long)ch);
				2523	if (w == NULL)
				2524	goto onError;
				2525	x = PyObject_GetItem(mapping, w);
				2526	Py_DECREF(w);
				2527	if (x == NULL) {
				2528	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2529	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2530	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2531	x = Py_None;
				2532	Py_INCREF(x);
				2533	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2534	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2535	}
				2536
				2537	/* Apply mapping */
				2538	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2539	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2540	if (value < 0 \|\| value > 255) {
				2541	PyErr_SetString(PyExc_TypeError,
				2542	"character mapping must be in range(256)");
				2543	Py_DECREF(x);
				2544	goto onError;
				2545	}
				2546	*s++ = (char)value;
				2547	}
				2548	else if (x == Py_None) {
				2549	/* undefined mapping */
				2550	if (charmap_encoding_error(&p, &s, errors,
				2551	"character maps to <undefined>")) {
				2552	Py_DECREF(x);
				2553	goto onError;
				2554	}
				2555	}
				2556	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2557	int targetsize = PyString_GET_SIZE(x);
				2558
				2559	if (targetsize == 1)
				2560	/* 1-1 mapping */
				2561	s++ = PyString_AS_STRING(x);
				2562
				2563	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2564	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2565	if (targetsize > extrachars) {
				2566	/* resize first */
				2567	int oldpos = (int)(s - PyString_AS_STRING(v));
				2568	int needed = (targetsize - extrachars) + \
				2569	(targetsize << 2);
				2570	extrachars += needed;
				2571	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2572	Py_DECREF(x);
				2573	goto onError;
				2574	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2575	s = PyString_AS_STRING(v) + oldpos;
				2576	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2577	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2578	s += targetsize;
				2579	extrachars -= targetsize;
				2580	}
				2581	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2582	}
				2583	else {
				2584	/* wrong return value */
				2585	PyErr_SetString(PyExc_TypeError,
				2586	"character mapping must return integer, None or unicode");
				2587	Py_DECREF(x);
				2588	goto onError;
				2589	}
				2590	Py_DECREF(x);
				2591	}
				2592	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2593	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2594	goto onError;
				2595	return v;
				2596
				2597	onError:
				2598	Py_DECREF(v);
				2599	return NULL;
				2600	}
				2601
				2602	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2603	PyObject *mapping)
				2604	{
				2605	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2606	PyErr_BadArgument();
				2607	return NULL;
				2608	}
				2609	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2610	PyUnicode_GET_SIZE(unicode),
				2611	mapping,
				2612	NULL);
				2613	}
				2614
				2615	static
				2616	int translate_error(const Py_UNICODE **source,
				2617	Py_UNICODE **dest,
				2618	const char *errors,
				2619	const char *details)
				2620	{
				2621	if ((errors == NULL) \|\|
				2622	(strcmp(errors,"strict") == 0)) {
				2623	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2624	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2625	details);
				2626	return -1;
				2627	}
				2628	else if (strcmp(errors,"ignore") == 0) {
				2629	return 0;
				2630	}
				2631	else if (strcmp(errors,"replace") == 0) {
				2632	**dest = '?';
				2633	(*dest)++;
				2634	return 0;
				2635	}
				2636	else {
				2637	PyErr_Format(PyExc_ValueError,
				2638	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2639	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2640	errors);
				2641	return -1;
				2642	}
				2643	}
				2644
				2645	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2646	int size,
				2647	PyObject *mapping,
				2648	const char *errors)
				2649	{
				2650	PyUnicodeObject *v;
				2651	Py_UNICODE *p;
				2652
				2653	if (mapping == NULL) {
				2654	PyErr_BadArgument();
				2655	return NULL;
				2656	}
				2657
				2658	/* Output will never be longer than input */
				2659	v = _PyUnicode_New(size);
				2660	if (v == NULL)
				2661	goto onError;
				2662	if (size == 0)
				2663	goto done;
				2664	p = PyUnicode_AS_UNICODE(v);
				2665	while (size-- > 0) {
				2666	Py_UNICODE ch = *s++;
				2667	PyObject w, x;
				2668
				2669	/* Get mapping */
				2670	w = PyInt_FromLong(ch);
				2671	if (w == NULL)
				2672	goto onError;
				2673	x = PyObject_GetItem(mapping, w);
				2674	Py_DECREF(w);
				2675	if (x == NULL) {
				2676	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2677	/* No mapping found: default to 1-1 mapping */
				2678	PyErr_Clear();
				2679	*p++ = ch;
				2680	continue;
				2681	}
				2682	goto onError;
				2683	}
				2684
				2685	/* Apply mapping */
				2686	if (PyInt_Check(x))
				2687	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2688	else if (x == Py_None) {
				2689	/* undefined mapping */
				2690	if (translate_error(&s, &p, errors,
				2691	"character maps to <undefined>")) {
				2692	Py_DECREF(x);
				2693	goto onError;
				2694	}
				2695	}
				2696	else if (PyUnicode_Check(x)) {
				2697	if (PyUnicode_GET_SIZE(x) != 1) {
				2698	/* 1-n mapping */
				2699	PyErr_SetString(PyExc_NotImplementedError,
				2700	"1-n mappings are currently not implemented");
				2701	Py_DECREF(x);
				2702	goto onError;
				2703	}
				2704	p++ = PyUnicode_AS_UNICODE(x);
				2705	}
				2706	else {
				2707	/* wrong return value */
				2708	PyErr_SetString(PyExc_TypeError,
				2709	"translate mapping must return integer, None or unicode");
				2710	Py_DECREF(x);
				2711	goto onError;
				2712	}
				2713	Py_DECREF(x);
				2714	}
				2715	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2716	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2717	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2718
				2719	done:
				2720	return (PyObject *)v;
				2721
				2722	onError:
				2723	Py_XDECREF(v);
				2724	return NULL;
				2725	}
				2726
				2727	PyObject PyUnicode_Translate(PyObject str,
				2728	PyObject *mapping,
				2729	const char *errors)
				2730	{
				2731	PyObject *result;
				2732
				2733	str = PyUnicode_FromObject(str);
				2734	if (str == NULL)
				2735	goto onError;
				2736	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2737	PyUnicode_GET_SIZE(str),
				2738	mapping,
				2739	errors);
				2740	Py_DECREF(str);
				2741	return result;
				2742
				2743	onError:
				2744	Py_XDECREF(str);
				2745	return NULL;
				2746	}
				2747
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2748	/* --- Decimal Encoder ---------------------------------------------------- */
				2749
				2750	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2751	int length,
				2752	char *output,
				2753	const char *errors)
				2754	{
				2755	Py_UNICODE p, end;
				2756
				2757	if (output == NULL) {
				2758	PyErr_BadArgument();
				2759	return -1;
				2760	}
				2761
				2762	p = s;
				2763	end = s + length;
				2764	while (p < end) {
				2765	register Py_UNICODE ch = *p++;
				2766	int decimal;
				2767
				2768	if (Py_UNICODE_ISSPACE(ch)) {
				2769	*output++ = ' ';
				2770	continue;
				2771	}
				2772	decimal = Py_UNICODE_TODECIMAL(ch);
				2773	if (decimal >= 0) {
				2774	*output++ = '0' + decimal;
				2775	continue;
				2776	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2777	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2778	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2779	continue;
				2780	}
				2781	/* All other characters are considered invalid */
				2782	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2783	PyErr_SetString(PyExc_ValueError,
				2784	"invalid decimal Unicode string");
				2785	goto onError;
				2786	}
				2787	else if (strcmp(errors, "ignore") == 0)
				2788	continue;
				2789	else if (strcmp(errors, "replace") == 0) {
				2790	*output++ = '?';
				2791	continue;
				2792	}
				2793	}
				2794	/* 0-terminate the output string */
				2795	*output++ = '\0';
				2796	return 0;
				2797
				2798	onError:
				2799	return -1;
				2800	}
				2801
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2802	/* --- Helpers ------------------------------------------------------------ */
				2803
				2804	static
				2805	int count(PyUnicodeObject *self,
				2806	int start,
				2807	int end,
				2808	PyUnicodeObject *substring)
				2809	{
				2810	int count = 0;
				2811
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2812	if (start < 0)
				2813	start += self->length;
				2814	if (start < 0)
				2815	start = 0;
				2816	if (end > self->length)
				2817	end = self->length;
				2818	if (end < 0)
				2819	end += self->length;
				2820	if (end < 0)
				2821	end = 0;
				2822
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2823	if (substring->length == 0)
				2824	return (end - start + 1);
				2825
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2826	end -= substring->length;
				2827
				2828	while (start <= end)
				2829	if (Py_UNICODE_MATCH(self, start, substring)) {
				2830	count++;
				2831	start += substring->length;
				2832	} else
				2833	start++;
				2834
				2835	return count;
				2836	}
				2837
				2838	int PyUnicode_Count(PyObject *str,
				2839	PyObject *substr,
				2840	int start,
				2841	int end)
				2842	{
				2843	int result;
				2844
				2845	str = PyUnicode_FromObject(str);
				2846	if (str == NULL)
				2847	return -1;
				2848	substr = PyUnicode_FromObject(substr);
				2849	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2850	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2851	return -1;
				2852	}
				2853
				2854	result = count((PyUnicodeObject *)str,
				2855	start, end,
				2856	(PyUnicodeObject *)substr);
				2857
				2858	Py_DECREF(str);
				2859	Py_DECREF(substr);
				2860	return result;
				2861	}
				2862
				2863	static
				2864	int findstring(PyUnicodeObject *self,
				2865	PyUnicodeObject *substring,
				2866	int start,
				2867	int end,
				2868	int direction)
				2869	{
				2870	if (start < 0)
				2871	start += self->length;
				2872	if (start < 0)
				2873	start = 0;
				2874
				2875	if (substring->length == 0)
				2876	return start;
				2877
				2878	if (end > self->length)
				2879	end = self->length;
				2880	if (end < 0)
				2881	end += self->length;
				2882	if (end < 0)
				2883	end = 0;
				2884
				2885	end -= substring->length;
				2886
				2887	if (direction < 0) {
				2888	for (; end >= start; end--)
				2889	if (Py_UNICODE_MATCH(self, end, substring))
				2890	return end;
				2891	} else {
				2892	for (; start <= end; start++)
				2893	if (Py_UNICODE_MATCH(self, start, substring))
				2894	return start;
				2895	}
				2896
				2897	return -1;
				2898	}
				2899
				2900	int PyUnicode_Find(PyObject *str,
				2901	PyObject *substr,
				2902	int start,
				2903	int end,
				2904	int direction)
				2905	{
				2906	int result;
				2907
				2908	str = PyUnicode_FromObject(str);
				2909	if (str == NULL)
				2910	return -1;
				2911	substr = PyUnicode_FromObject(substr);
				2912	if (substr == NULL) {
				2913	Py_DECREF(substr);
				2914	return -1;
				2915	}
				2916
				2917	result = findstring((PyUnicodeObject *)str,
				2918	(PyUnicodeObject *)substr,
				2919	start, end, direction);
				2920	Py_DECREF(str);
				2921	Py_DECREF(substr);
				2922	return result;
				2923	}
				2924
				2925	static
				2926	int tailmatch(PyUnicodeObject *self,
				2927	PyUnicodeObject *substring,
				2928	int start,
				2929	int end,
				2930	int direction)
				2931	{
				2932	if (start < 0)
				2933	start += self->length;
				2934	if (start < 0)
				2935	start = 0;
				2936
				2937	if (substring->length == 0)
				2938	return 1;
				2939
				2940	if (end > self->length)
				2941	end = self->length;
				2942	if (end < 0)
				2943	end += self->length;
				2944	if (end < 0)
				2945	end = 0;
				2946
				2947	end -= substring->length;
				2948	if (end < start)
				2949	return 0;
				2950
				2951	if (direction > 0) {
				2952	if (Py_UNICODE_MATCH(self, end, substring))
				2953	return 1;
				2954	} else {
				2955	if (Py_UNICODE_MATCH(self, start, substring))
				2956	return 1;
				2957	}
				2958
				2959	return 0;
				2960	}
				2961
				2962	int PyUnicode_Tailmatch(PyObject *str,
				2963	PyObject *substr,
				2964	int start,
				2965	int end,
				2966	int direction)
				2967	{
				2968	int result;
				2969
				2970	str = PyUnicode_FromObject(str);
				2971	if (str == NULL)
				2972	return -1;
				2973	substr = PyUnicode_FromObject(substr);
				2974	if (substr == NULL) {
				2975	Py_DECREF(substr);
				2976	return -1;
				2977	}
				2978
				2979	result = tailmatch((PyUnicodeObject *)str,
				2980	(PyUnicodeObject *)substr,
				2981	start, end, direction);
				2982	Py_DECREF(str);
				2983	Py_DECREF(substr);
				2984	return result;
				2985	}
				2986
				2987	static
				2988	const Py_UNICODE findchar(const Py_UNICODE s,
				2989	int size,
				2990	Py_UNICODE ch)
				2991	{
				2992	/* like wcschr, but doesn't stop at NULL characters */
				2993
				2994	while (size-- > 0) {
				2995	if (*s == ch)
				2996	return s;
				2997	s++;
				2998	}
				2999
				3000	return NULL;
				3001	}
				3002
				3003	/* Apply fixfct filter to the Unicode object self and return a
				3004	reference to the modified object */
				3005
				3006	static
				3007	PyObject fixup(PyUnicodeObject self,
				3008	int (fixfct)(PyUnicodeObject s))
				3009	{
				3010
				3011	PyUnicodeObject *u;
				3012
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3013	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3014	if (u == NULL)
				3015	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3016
				3017	Py_UNICODE_COPY(u->str, self->str, self->length);
				3018
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3019	if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3020	/* fixfct should return TRUE if it modified the buffer. If
				3021	FALSE, return a reference to the original buffer instead
				3022	(to save space, not time) */
				3023	Py_INCREF(self);
				3024	Py_DECREF(u);
				3025	return (PyObject*) self;
				3026	}
				3027	return (PyObject*) u;
				3028	}
				3029
				3030	static
				3031	int fixupper(PyUnicodeObject *self)
				3032	{
				3033	int len = self->length;
				3034	Py_UNICODE *s = self->str;
				3035	int status = 0;
				3036
				3037	while (len-- > 0) {
				3038	register Py_UNICODE ch;
				3039
				3040	ch = Py_UNICODE_TOUPPER(*s);
				3041	if (ch != *s) {
				3042	status = 1;
				3043	*s = ch;
				3044	}
				3045	s++;
				3046	}
				3047
				3048	return status;
				3049	}
				3050
				3051	static
				3052	int fixlower(PyUnicodeObject *self)
				3053	{
				3054	int len = self->length;
				3055	Py_UNICODE *s = self->str;
				3056	int status = 0;
				3057
				3058	while (len-- > 0) {
				3059	register Py_UNICODE ch;
				3060
				3061	ch = Py_UNICODE_TOLOWER(*s);
				3062	if (ch != *s) {
				3063	status = 1;
				3064	*s = ch;
				3065	}
				3066	s++;
				3067	}
				3068
				3069	return status;
				3070	}
				3071
				3072	static
				3073	int fixswapcase(PyUnicodeObject *self)
				3074	{
				3075	int len = self->length;
				3076	Py_UNICODE *s = self->str;
				3077	int status = 0;
				3078
				3079	while (len-- > 0) {
				3080	if (Py_UNICODE_ISUPPER(*s)) {
				3081	s = Py_UNICODE_TOLOWER(s);
				3082	status = 1;
				3083	} else if (Py_UNICODE_ISLOWER(*s)) {
				3084	s = Py_UNICODE_TOUPPER(s);
				3085	status = 1;
				3086	}
				3087	s++;
				3088	}
				3089
				3090	return status;
				3091	}
				3092
				3093	static
				3094	int fixcapitalize(PyUnicodeObject *self)
				3095	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3096	int len = self->length;
				3097	Py_UNICODE *s = self->str;
				3098	int status = 0;
				3099
				3100	if (len == 0)
				3101	return 0;
				3102	if (Py_UNICODE_ISLOWER(*s)) {
				3103	s = Py_UNICODE_TOUPPER(s);
				3104	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3105	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3106	s++;
				3107	while (--len > 0) {
				3108	if (Py_UNICODE_ISUPPER(*s)) {
				3109	s = Py_UNICODE_TOLOWER(s);
				3110	status = 1;
				3111	}
				3112	s++;
				3113	}
				3114	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3115	}
				3116
				3117	static
				3118	int fixtitle(PyUnicodeObject *self)
				3119	{
				3120	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3121	register Py_UNICODE *e;
				3122	int previous_is_cased;
				3123
				3124	/* Shortcut for single character strings */
				3125	if (PyUnicode_GET_SIZE(self) == 1) {
				3126	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				3127	if (*p != ch) {
				3128	*p = ch;
				3129	return 1;
				3130	}
				3131	else
				3132	return 0;
				3133	}
				3134
				3135	e = p + PyUnicode_GET_SIZE(self);
				3136	previous_is_cased = 0;
				3137	for (; p < e; p++) {
				3138	register const Py_UNICODE ch = *p;
				3139
				3140	if (previous_is_cased)
				3141	*p = Py_UNICODE_TOLOWER(ch);
				3142	else
				3143	*p = Py_UNICODE_TOTITLE(ch);
				3144
				3145	if (Py_UNICODE_ISLOWER(ch) \|\|
				3146	Py_UNICODE_ISUPPER(ch) \|\|
				3147	Py_UNICODE_ISTITLE(ch))
				3148	previous_is_cased = 1;
				3149	else
				3150	previous_is_cased = 0;
				3151	}
				3152	return 1;
				3153	}
				3154
				3155	PyObject PyUnicode_Join(PyObject separator,
				3156	PyObject *seq)
				3157	{
				3158	Py_UNICODE *sep;
				3159	int seplen;
				3160	PyUnicodeObject *res = NULL;
				3161	int reslen = 0;
				3162	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3163	int sz = 100;
				3164	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3165	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3166
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3167	it = PyObject_GetIter(seq);
				3168	if (it == NULL)
				3169	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3170
				3171	if (separator == NULL) {
				3172	Py_UNICODE blank = ' ';
				3173	sep = &blank;
				3174	seplen = 1;
				3175	}
				3176	else {
				3177	separator = PyUnicode_FromObject(separator);
				3178	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3179	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3180	sep = PyUnicode_AS_UNICODE(separator);
				3181	seplen = PyUnicode_GET_SIZE(separator);
				3182	}
				3183
				3184	res = _PyUnicode_New(sz);
				3185	if (res == NULL)
				3186	goto onError;
				3187	p = PyUnicode_AS_UNICODE(res);
				3188	reslen = 0;
				3189
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3190	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3191	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3192	PyObject *item = PyIter_Next(it);
				3193	if (item == NULL) {
				3194	if (PyErr_Occurred())
				3195	goto onError;
				3196	break;
				3197	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3198	if (!PyUnicode_Check(item)) {
				3199	PyObject *v;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3200	if (!PyString_Check(item)) {
				3201	PyErr_Format(PyExc_TypeError,
				3202	"sequence item %i: expected string or Unicode,"
				3203	" %.80s found",
				3204	i, item->ob_type->tp_name);
				3205	Py_DECREF(item);
				3206	goto onError;
				3207	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3208	v = PyUnicode_FromObject(item);
				3209	Py_DECREF(item);
				3210	item = v;
				3211	if (item == NULL)
				3212	goto onError;
				3213	}
				3214	itemlen = PyUnicode_GET_SIZE(item);
				3215	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3216	if (_PyUnicode_Resize(&res, sz*2)) {
				3217	Py_DECREF(item);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3218	goto onError;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3219	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3220	sz *= 2;
				3221	p = PyUnicode_AS_UNICODE(res) + reslen;
				3222	}
				3223	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3224	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3225	p += seplen;
				3226	reslen += seplen;
				3227	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3228	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3229	p += itemlen;
				3230	reslen += itemlen;
				3231	Py_DECREF(item);
				3232	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3233	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3234	goto onError;
				3235
				3236	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3237	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3238	return (PyObject *)res;
				3239
				3240	onError:
				3241	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3242	Py_XDECREF(res);
				3243	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3244	return NULL;
				3245	}
				3246
				3247	static
				3248	PyUnicodeObject pad(PyUnicodeObject self,
				3249	int left,
				3250	int right,
				3251	Py_UNICODE fill)
				3252	{
				3253	PyUnicodeObject *u;
				3254
				3255	if (left < 0)
				3256	left = 0;
				3257	if (right < 0)
				3258	right = 0;
				3259
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3260	if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3261	Py_INCREF(self);
				3262	return self;
				3263	}
				3264
				3265	u = _PyUnicode_New(left + self->length + right);
				3266	if (u) {
				3267	if (left)
				3268	Py_UNICODE_FILL(u->str, fill, left);
				3269	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				3270	if (right)
				3271	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				3272	}
				3273
				3274	return u;
				3275	}
				3276
				3277	#define SPLIT_APPEND(data, left, right) \
				3278	str = PyUnicode_FromUnicode(data + left, right - left); \
				3279	if (!str) \
				3280	goto onError; \
				3281	if (PyList_Append(list, str)) { \
				3282	Py_DECREF(str); \
				3283	goto onError; \
				3284	} \
				3285	else \
				3286	Py_DECREF(str);
				3287
				3288	static
				3289	PyObject split_whitespace(PyUnicodeObject self,
				3290	PyObject *list,
				3291	int maxcount)
				3292	{
				3293	register int i;
				3294	register int j;
				3295	int len = self->length;
				3296	PyObject *str;
				3297
				3298	for (i = j = 0; i < len; ) {
				3299	/* find a token */
				3300	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3301	i++;
				3302	j = i;
				3303	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				3304	i++;
				3305	if (j < i) {
				3306	if (maxcount-- <= 0)
				3307	break;
				3308	SPLIT_APPEND(self->str, j, i);
				3309	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3310	i++;
				3311	j = i;
				3312	}
				3313	}
				3314	if (j < len) {
				3315	SPLIT_APPEND(self->str, j, len);
				3316	}
				3317	return list;
				3318
				3319	onError:
				3320	Py_DECREF(list);
				3321	return NULL;
				3322	}
				3323
				3324	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3325	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3326	{
				3327	register int i;
				3328	register int j;
				3329	int len;
				3330	PyObject *list;
				3331	PyObject *str;
				3332	Py_UNICODE *data;
				3333
				3334	string = PyUnicode_FromObject(string);
				3335	if (string == NULL)
				3336	return NULL;
				3337	data = PyUnicode_AS_UNICODE(string);
				3338	len = PyUnicode_GET_SIZE(string);
				3339
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3340	list = PyList_New(0);
				3341	if (!list)
				3342	goto onError;
				3343
				3344	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3345	int eol;
				3346
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3347	/* Find a line and append it */
				3348	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3349	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3350
				3351	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3352	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3353	if (i < len) {
				3354	if (data[i] == '\r' && i + 1 < len &&
				3355	data[i+1] == '\n')
				3356	i += 2;
				3357	else
				3358	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3359	if (keepends)
				3360	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3361	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3362	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3363	j = i;
				3364	}
				3365	if (j < len) {
				3366	SPLIT_APPEND(data, j, len);
				3367	}
				3368
				3369	Py_DECREF(string);
				3370	return list;
				3371
				3372	onError:
				3373	Py_DECREF(list);
				3374	Py_DECREF(string);
				3375	return NULL;
				3376	}
				3377
				3378	static
				3379	PyObject split_char(PyUnicodeObject self,
				3380	PyObject *list,
				3381	Py_UNICODE ch,
				3382	int maxcount)
				3383	{
				3384	register int i;
				3385	register int j;
				3386	int len = self->length;
				3387	PyObject *str;
				3388
				3389	for (i = j = 0; i < len; ) {
				3390	if (self->str[i] == ch) {
				3391	if (maxcount-- <= 0)
				3392	break;
				3393	SPLIT_APPEND(self->str, j, i);
				3394	i = j = i + 1;
				3395	} else
				3396	i++;
				3397	}
				3398	if (j <= len) {
				3399	SPLIT_APPEND(self->str, j, len);
				3400	}
				3401	return list;
				3402
				3403	onError:
				3404	Py_DECREF(list);
				3405	return NULL;
				3406	}
				3407
				3408	static
				3409	PyObject split_substring(PyUnicodeObject self,
				3410	PyObject *list,
				3411	PyUnicodeObject *substring,
				3412	int maxcount)
				3413	{
				3414	register int i;
				3415	register int j;
				3416	int len = self->length;
				3417	int sublen = substring->length;
				3418	PyObject *str;
				3419
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3420	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3421	if (Py_UNICODE_MATCH(self, i, substring)) {
				3422	if (maxcount-- <= 0)
				3423	break;
				3424	SPLIT_APPEND(self->str, j, i);
				3425	i = j = i + sublen;
				3426	} else
				3427	i++;
				3428	}
				3429	if (j <= len) {
				3430	SPLIT_APPEND(self->str, j, len);
				3431	}
				3432	return list;
				3433
				3434	onError:
				3435	Py_DECREF(list);
				3436	return NULL;
				3437	}
				3438
				3439	#undef SPLIT_APPEND
				3440
				3441	static
				3442	PyObject split(PyUnicodeObject self,
				3443	PyUnicodeObject *substring,
				3444	int maxcount)
				3445	{
				3446	PyObject *list;
				3447
				3448	if (maxcount < 0)
				3449	maxcount = INT_MAX;
				3450
				3451	list = PyList_New(0);
				3452	if (!list)
				3453	return NULL;
				3454
				3455	if (substring == NULL)
				3456	return split_whitespace(self,list,maxcount);
				3457
				3458	else if (substring->length == 1)
				3459	return split_char(self,list,substring->str[0],maxcount);
				3460
				3461	else if (substring->length == 0) {
				3462	Py_DECREF(list);
				3463	PyErr_SetString(PyExc_ValueError, "empty separator");
				3464	return NULL;
				3465	}
				3466	else
				3467	return split_substring(self,list,substring,maxcount);
				3468	}
				3469
				3470	static
				3471	PyObject strip(PyUnicodeObject self,
				3472	int left,
				3473	int right)
				3474	{
				3475	Py_UNICODE *p = self->str;
				3476	int start = 0;
				3477	int end = self->length;
				3478
				3479	if (left)
				3480	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3481	start++;
				3482
				3483	if (right)
				3484	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3485	end--;
				3486
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3487	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3488	/* couldn't strip anything off, return original string */
				3489	Py_INCREF(self);
				3490	return (PyObject*) self;
				3491	}
				3492
				3493	return (PyObject*) PyUnicode_FromUnicode(
				3494	self->str + start,
				3495	end - start
				3496	);
				3497	}
				3498
				3499	static
				3500	PyObject replace(PyUnicodeObject self,
				3501	PyUnicodeObject *str1,
				3502	PyUnicodeObject *str2,
				3503	int maxcount)
				3504	{
				3505	PyUnicodeObject *u;
				3506
				3507	if (maxcount < 0)
				3508	maxcount = INT_MAX;
				3509
				3510	if (str1->length == 1 && str2->length == 1) {
				3511	int i;
				3512
				3513	/* replace characters */
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3514	if (!findchar(self->str, self->length, str1->str[0]) &&
				3515	PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3516	/* nothing to replace, return original string */
				3517	Py_INCREF(self);
				3518	u = self;
				3519	} else {
				3520	Py_UNICODE u1 = str1->str[0];
				3521	Py_UNICODE u2 = str2->str[0];
				3522
				3523	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3524	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3525	self->length
				3526	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3527	if (u != NULL) {
				3528	Py_UNICODE_COPY(u->str, self->str,
				3529	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3530	for (i = 0; i < u->length; i++)
				3531	if (u->str[i] == u1) {
				3532	if (--maxcount < 0)
				3533	break;
				3534	u->str[i] = u2;
				3535	}
				3536	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3537	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3538
				3539	} else {
				3540	int n, i;
				3541	Py_UNICODE *p;
				3542
				3543	/* replace strings */
				3544	n = count(self, 0, self->length, str1);
				3545	if (n > maxcount)
				3546	n = maxcount;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3547	if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3548	/* nothing to replace, return original string */
				3549	Py_INCREF(self);
				3550	u = self;
				3551	} else {
				3552	u = _PyUnicode_New(
				3553	self->length + n * (str2->length - str1->length));
				3554	if (u) {
				3555	i = 0;
				3556	p = u->str;
				3557	while (i <= self->length - str1->length)
				3558	if (Py_UNICODE_MATCH(self, i, str1)) {
				3559	/* replace string segment */
				3560	Py_UNICODE_COPY(p, str2->str, str2->length);
				3561	p += str2->length;
				3562	i += str1->length;
				3563	if (--n <= 0) {
				3564	/* copy remaining part */
				3565	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3566	break;
				3567	}
				3568	} else
				3569	*p++ = self->str[i++];
				3570	}
				3571	}
				3572	}
				3573
				3574	return (PyObject *) u;
				3575	}
				3576
				3577	/* --- Unicode Object Methods --------------------------------------------- */
				3578
				3579	static char title__doc__[] =
				3580	"S.title() -> unicode\n\
				3581	\n\
				3582	Return a titlecased version of S, i.e. words start with title case\n\
				3583	characters, all remaining cased characters have lower case.";
				3584
				3585	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3586	unicode_title(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3587	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3588	return fixup(self, fixtitle);
				3589	}
				3590
				3591	static char capitalize__doc__[] =
				3592	"S.capitalize() -> unicode\n\
				3593	\n\
				3594	Return a capitalized version of S, i.e. make the first character\n\
				3595	have upper case.";
				3596
				3597	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3598	unicode_capitalize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3599	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3600	return fixup(self, fixcapitalize);
				3601	}
				3602
				3603	#if 0
				3604	static char capwords__doc__[] =
				3605	"S.capwords() -> unicode\n\
				3606	\n\
				3607	Apply .capitalize() to all words in S and return the result with\n\
				3608	normalized whitespace (all whitespace strings are replaced by ' ').";
				3609
				3610	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3611	unicode_capwords(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3612	{
				3613	PyObject *list;
				3614	PyObject *item;
				3615	int i;
				3616
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3617	/* Split into words */
				3618	list = split(self, NULL, -1);
				3619	if (!list)
				3620	return NULL;
				3621
				3622	/* Capitalize each word */
				3623	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3624	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3625	fixcapitalize);
				3626	if (item == NULL)
				3627	goto onError;
				3628	Py_DECREF(PyList_GET_ITEM(list, i));
				3629	PyList_SET_ITEM(list, i, item);
				3630	}
				3631
				3632	/* Join the words to form a new string */
				3633	item = PyUnicode_Join(NULL, list);
				3634
				3635	onError:
				3636	Py_DECREF(list);
				3637	return (PyObject *)item;
				3638	}
				3639	#endif
				3640
				3641	static char center__doc__[] =
				3642	"S.center(width) -> unicode\n\
				3643	\n\
				3644	Return S centered in a Unicode string of length width. Padding is done\n\
				3645	using spaces.";
				3646
				3647	static PyObject *
				3648	unicode_center(PyUnicodeObject self, PyObject args)
				3649	{
				3650	int marg, left;
				3651	int width;
				3652
				3653	if (!PyArg_ParseTuple(args, "i:center", &width))
				3654	return NULL;
				3655
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3656	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3657	Py_INCREF(self);
				3658	return (PyObject*) self;
				3659	}
				3660
				3661	marg = width - self->length;
				3662	left = marg / 2 + (marg & width & 1);
				3663
				3664	return (PyObject*) pad(self, left, marg - left, ' ');
				3665	}
				3666
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3667	#if 0
				3668
				3669	/* This code should go into some future Unicode collation support
				3670	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3671	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3672
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3673	/* speedy UTF-16 code point order comparison */
				3674	/* gleaned from: */
				3675	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3676
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3677	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3678	{
				3679	0, 0, 0, 0, 0, 0, 0, 0,
				3680	0, 0, 0, 0, 0, 0, 0, 0,
				3681	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3682	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3683	};
				3684
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3685	static int
				3686	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3687	{
				3688	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3689
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3690	Py_UNICODE *s1 = str1->str;
				3691	Py_UNICODE *s2 = str2->str;
				3692
				3693	len1 = str1->length;
				3694	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3695
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3696	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3697	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3698
				3699	c1 = *s1++;
				3700	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3701
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3702	if (c1 > (1<<11) * 26)
				3703	c1 += utf16Fixup[c1>>11];
				3704	if (c2 > (1<<11) * 26)
				3705	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3706	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3707
				3708	if (c1 != c2)
				3709	return (c1 < c2) ? -1 : 1;
				3710
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3711	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3712	}
				3713
				3714	return (len1 < len2) ? -1 : (len1 != len2);
				3715	}
				3716
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3717	#else
				3718
				3719	static int
				3720	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3721	{
				3722	register int len1, len2;
				3723
				3724	Py_UNICODE *s1 = str1->str;
				3725	Py_UNICODE *s2 = str2->str;
				3726
				3727	len1 = str1->length;
				3728	len2 = str2->length;
				3729
				3730	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3731	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3732
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3733	c1 = *s1++;
				3734	c2 = *s2++;
				3735
				3736	if (c1 != c2)
				3737	return (c1 < c2) ? -1 : 1;
				3738
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3739	len1--; len2--;
				3740	}
				3741
				3742	return (len1 < len2) ? -1 : (len1 != len2);
				3743	}
				3744
				3745	#endif
				3746
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3747	int PyUnicode_Compare(PyObject *left,
				3748	PyObject *right)
				3749	{
				3750	PyUnicodeObject u = NULL, v = NULL;
				3751	int result;
				3752
				3753	/* Coerce the two arguments */
				3754	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3755	if (u == NULL)
				3756	goto onError;
				3757	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3758	if (v == NULL)
				3759	goto onError;
				3760
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3761	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3762	if (v == u) {
				3763	Py_DECREF(u);
				3764	Py_DECREF(v);
				3765	return 0;
				3766	}
				3767
				3768	result = unicode_compare(u, v);
				3769
				3770	Py_DECREF(u);
				3771	Py_DECREF(v);
				3772	return result;
				3773
				3774	onError:
				3775	Py_XDECREF(u);
				3776	Py_XDECREF(v);
				3777	return -1;
				3778	}
				3779
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3780	int PyUnicode_Contains(PyObject *container,
				3781	PyObject *element)
				3782	{
				3783	PyUnicodeObject u = NULL, v = NULL;
				3784	int result;
				3785	register const Py_UNICODE p, e;
				3786	register Py_UNICODE ch;
				3787
				3788	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3789	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3790	if (v == NULL) {
				3791	PyErr_SetString(PyExc_TypeError,
				3792	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3793	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3794	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3795	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3796	if (u == NULL) {
				3797	Py_DECREF(v);
				3798	goto onError;
				3799	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3800
				3801	/* Check v in u */
				3802	if (PyUnicode_GET_SIZE(v) != 1) {
				3803	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3804	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3805	goto onError;
				3806	}
				3807	ch = *PyUnicode_AS_UNICODE(v);
				3808	p = PyUnicode_AS_UNICODE(u);
				3809	e = p + PyUnicode_GET_SIZE(u);
				3810	result = 0;
				3811	while (p < e) {
				3812	if (*p++ == ch) {
				3813	result = 1;
				3814	break;
				3815	}
				3816	}
				3817
				3818	Py_DECREF(u);
				3819	Py_DECREF(v);
				3820	return result;
				3821
				3822	onError:
				3823	Py_XDECREF(u);
				3824	Py_XDECREF(v);
				3825	return -1;
				3826	}
				3827
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3828	/* Concat to string or Unicode object giving a new Unicode object. */
				3829
				3830	PyObject PyUnicode_Concat(PyObject left,
				3831	PyObject *right)
				3832	{
				3833	PyUnicodeObject u = NULL, v = NULL, *w;
				3834
				3835	/* Coerce the two arguments */
				3836	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3837	if (u == NULL)
				3838	goto onError;
				3839	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3840	if (v == NULL)
				3841	goto onError;
				3842
				3843	/* Shortcuts */
				3844	if (v == unicode_empty) {
				3845	Py_DECREF(v);
				3846	return (PyObject *)u;
				3847	}
				3848	if (u == unicode_empty) {
				3849	Py_DECREF(u);
				3850	return (PyObject *)v;
				3851	}
				3852
				3853	/* Concat the two Unicode strings */
				3854	w = _PyUnicode_New(u->length + v->length);
				3855	if (w == NULL)
				3856	goto onError;
				3857	Py_UNICODE_COPY(w->str, u->str, u->length);
				3858	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3859
				3860	Py_DECREF(u);
				3861	Py_DECREF(v);
				3862	return (PyObject *)w;
				3863
				3864	onError:
				3865	Py_XDECREF(u);
				3866	Py_XDECREF(v);
				3867	return NULL;
				3868	}
				3869
				3870	static char count__doc__[] =
				3871	"S.count(sub[, start[, end]]) -> int\n\
				3872	\n\
				3873	Return the number of occurrences of substring sub in Unicode string\n\
				3874	S[start:end]. Optional arguments start and end are\n\
				3875	interpreted as in slice notation.";
				3876
				3877	static PyObject *
				3878	unicode_count(PyUnicodeObject self, PyObject args)
				3879	{
				3880	PyUnicodeObject *substring;
				3881	int start = 0;
				3882	int end = INT_MAX;
				3883	PyObject *result;
				3884
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3885	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3886	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3887	return NULL;
				3888
				3889	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3890	(PyObject *)substring);
				3891	if (substring == NULL)
				3892	return NULL;
				3893
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3894	if (start < 0)
				3895	start += self->length;
				3896	if (start < 0)
				3897	start = 0;
				3898	if (end > self->length)
				3899	end = self->length;
				3900	if (end < 0)
				3901	end += self->length;
				3902	if (end < 0)
				3903	end = 0;
				3904
				3905	result = PyInt_FromLong((long) count(self, start, end, substring));
				3906
				3907	Py_DECREF(substring);
				3908	return result;
				3909	}
				3910
				3911	static char encode__doc__[] =
				3912	"S.encode([encoding[,errors]]) -> string\n\
				3913	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3914	Return an encoded string version of S. Default encoding is the current\n\
				3915	default string encoding. errors may be given to set a different error\n\
				3916	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3917	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3918
				3919	static PyObject *
				3920	unicode_encode(PyUnicodeObject self, PyObject args)
				3921	{
				3922	char *encoding = NULL;
				3923	char *errors = NULL;
				3924	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3925	return NULL;
				3926	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3927	}
				3928
				3929	static char expandtabs__doc__[] =
				3930	"S.expandtabs([tabsize]) -> unicode\n\
				3931	\n\
				3932	Return a copy of S where all tab characters are expanded using spaces.\n\
				3933	If tabsize is not given, a tab size of 8 characters is assumed.";
				3934
				3935	static PyObject*
				3936	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3937	{
				3938	Py_UNICODE *e;
				3939	Py_UNICODE *p;
				3940	Py_UNICODE *q;
				3941	int i, j;
				3942	PyUnicodeObject *u;
				3943	int tabsize = 8;
				3944
				3945	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3946	return NULL;
				3947
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3948	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3949	i = j = 0;
				3950	e = self->str + self->length;
				3951	for (p = self->str; p < e; p++)
				3952	if (*p == '\t') {
				3953	if (tabsize > 0)
				3954	j += tabsize - (j % tabsize);
				3955	}
				3956	else {
				3957	j++;
				3958	if (p == '\n' \|\| p == '\r') {
				3959	i += j;
				3960	j = 0;
				3961	}
				3962	}
				3963
				3964	/* Second pass: create output string and fill it */
				3965	u = _PyUnicode_New(i + j);
				3966	if (!u)
				3967	return NULL;
				3968
				3969	j = 0;
				3970	q = u->str;
				3971
				3972	for (p = self->str; p < e; p++)
				3973	if (*p == '\t') {
				3974	if (tabsize > 0) {
				3975	i = tabsize - (j % tabsize);
				3976	j += i;
				3977	while (i--)
				3978	*q++ = ' ';
				3979	}
				3980	}
				3981	else {
				3982	j++;
				3983	q++ = p;
				3984	if (p == '\n' \|\| p == '\r')
				3985	j = 0;
				3986	}
				3987
				3988	return (PyObject*) u;
				3989	}
				3990
				3991	static char find__doc__[] =
				3992	"S.find(sub [,start [,end]]) -> int\n\
				3993	\n\
				3994	Return the lowest index in S where substring sub is found,\n\
				3995	such that sub is contained within s[start,end]. Optional\n\
				3996	arguments start and end are interpreted as in slice notation.\n\
				3997	\n\
				3998	Return -1 on failure.";
				3999
				4000	static PyObject *
				4001	unicode_find(PyUnicodeObject self, PyObject args)
				4002	{
				4003	PyUnicodeObject *substring;
				4004	int start = 0;
				4005	int end = INT_MAX;
				4006	PyObject *result;
				4007
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4008	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				4009	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4010	return NULL;
				4011	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4012	(PyObject *)substring);
				4013	if (substring == NULL)
				4014	return NULL;
				4015
				4016	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				4017
				4018	Py_DECREF(substring);
				4019	return result;
				4020	}
				4021
				4022	static PyObject *
				4023	unicode_getitem(PyUnicodeObject *self, int index)
				4024	{
				4025	if (index < 0 \|\| index >= self->length) {
				4026	PyErr_SetString(PyExc_IndexError, "string index out of range");
				4027	return NULL;
				4028	}
				4029
				4030	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				4031	}
				4032
				4033	static long
				4034	unicode_hash(PyUnicodeObject *self)
				4035	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4036	/* Since Unicode objects compare equal to their ASCII string
				4037	counterparts, they should use the individual character values
				4038	as basis for their hash value. This is needed to assure that
				4039	strings and Unicode objects behave in the same way as
				4040	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4041
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4042	register int len;
				4043	register Py_UNICODE *p;
				4044	register long x;
				4045
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4046	if (self->hash != -1)
				4047	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4048	len = PyUnicode_GET_SIZE(self);
				4049	p = PyUnicode_AS_UNICODE(self);
				4050	x = *p << 7;
				4051	while (--len >= 0)
				4052	x = (1000003x) ^ p++;
				4053	x ^= PyUnicode_GET_SIZE(self);
				4054	if (x == -1)
				4055	x = -2;
				4056	self->hash = x;
				4057	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4058	}
				4059
				4060	static char index__doc__[] =
				4061	"S.index(sub [,start [,end]]) -> int\n\
				4062	\n\
				4063	Like S.find() but raise ValueError when the substring is not found.";
				4064
				4065	static PyObject *
				4066	unicode_index(PyUnicodeObject self, PyObject args)
				4067	{
				4068	int result;
				4069	PyUnicodeObject *substring;
				4070	int start = 0;
				4071	int end = INT_MAX;
				4072
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4073	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				4074	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4075	return NULL;
				4076
				4077	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4078	(PyObject *)substring);
				4079	if (substring == NULL)
				4080	return NULL;
				4081
				4082	result = findstring(self, substring, start, end, 1);
				4083
				4084	Py_DECREF(substring);
				4085	if (result < 0) {
				4086	PyErr_SetString(PyExc_ValueError, "substring not found");
				4087	return NULL;
				4088	}
				4089	return PyInt_FromLong(result);
				4090	}
				4091
				4092	static char islower__doc__[] =
				4093	"S.islower() -> int\n\
				4094	\n\
				4095	Return 1 if all cased characters in S are lowercase and there is\n\
				4096	at least one cased character in S, 0 otherwise.";
				4097
				4098	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4099	unicode_islower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4100	{
				4101	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4102	register const Py_UNICODE *e;
				4103	int cased;
				4104
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4105	/* Shortcut for single character strings */
				4106	if (PyUnicode_GET_SIZE(self) == 1)
				4107	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				4108
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4109	/* Special case for empty strings */
				4110	if (PyString_GET_SIZE(self) == 0)
				4111	return PyInt_FromLong(0);
				4112
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4113	e = p + PyUnicode_GET_SIZE(self);
				4114	cased = 0;
				4115	for (; p < e; p++) {
				4116	register const Py_UNICODE ch = *p;
				4117
				4118	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4119	return PyInt_FromLong(0);
				4120	else if (!cased && Py_UNICODE_ISLOWER(ch))
				4121	cased = 1;
				4122	}
				4123	return PyInt_FromLong(cased);
				4124	}
				4125
				4126	static char isupper__doc__[] =
				4127	"S.isupper() -> int\n\
				4128	\n\
				4129	Return 1 if all cased characters in S are uppercase and there is\n\
				4130	at least one cased character in S, 0 otherwise.";
				4131
				4132	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4133	unicode_isupper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4134	{
				4135	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4136	register const Py_UNICODE *e;
				4137	int cased;
				4138
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4139	/* Shortcut for single character strings */
				4140	if (PyUnicode_GET_SIZE(self) == 1)
				4141	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				4142
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4143	/* Special case for empty strings */
				4144	if (PyString_GET_SIZE(self) == 0)
				4145	return PyInt_FromLong(0);
				4146
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4147	e = p + PyUnicode_GET_SIZE(self);
				4148	cased = 0;
				4149	for (; p < e; p++) {
				4150	register const Py_UNICODE ch = *p;
				4151
				4152	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4153	return PyInt_FromLong(0);
				4154	else if (!cased && Py_UNICODE_ISUPPER(ch))
				4155	cased = 1;
				4156	}
				4157	return PyInt_FromLong(cased);
				4158	}
				4159
				4160	static char istitle__doc__[] =
				4161	"S.istitle() -> int\n\
				4162	\n\
				4163	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				4164	may only follow uncased characters and lowercase characters only cased\n\
				4165	ones. Return 0 otherwise.";
				4166
				4167	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4168	unicode_istitle(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4169	{
				4170	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4171	register const Py_UNICODE *e;
				4172	int cased, previous_is_cased;
				4173
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4174	/* Shortcut for single character strings */
				4175	if (PyUnicode_GET_SIZE(self) == 1)
				4176	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				4177	(Py_UNICODE_ISUPPER(*p) != 0));
				4178
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4179	/* Special case for empty strings */
				4180	if (PyString_GET_SIZE(self) == 0)
				4181	return PyInt_FromLong(0);
				4182
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4183	e = p + PyUnicode_GET_SIZE(self);
				4184	cased = 0;
				4185	previous_is_cased = 0;
				4186	for (; p < e; p++) {
				4187	register const Py_UNICODE ch = *p;
				4188
				4189	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				4190	if (previous_is_cased)
				4191	return PyInt_FromLong(0);
				4192	previous_is_cased = 1;
				4193	cased = 1;
				4194	}
				4195	else if (Py_UNICODE_ISLOWER(ch)) {
				4196	if (!previous_is_cased)
				4197	return PyInt_FromLong(0);
				4198	previous_is_cased = 1;
				4199	cased = 1;
				4200	}
				4201	else
				4202	previous_is_cased = 0;
				4203	}
				4204	return PyInt_FromLong(cased);
				4205	}
				4206
				4207	static char isspace__doc__[] =
				4208	"S.isspace() -> int\n\
				4209	\n\
				4210	Return 1 if there are only whitespace characters in S,\n\
				4211	0 otherwise.";
				4212
				4213	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4214	unicode_isspace(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4215	{
				4216	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4217	register const Py_UNICODE *e;
				4218
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4219	/* Shortcut for single character strings */
				4220	if (PyUnicode_GET_SIZE(self) == 1 &&
				4221	Py_UNICODE_ISSPACE(*p))
				4222	return PyInt_FromLong(1);
				4223
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4224	/* Special case for empty strings */
				4225	if (PyString_GET_SIZE(self) == 0)
				4226	return PyInt_FromLong(0);
				4227
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4228	e = p + PyUnicode_GET_SIZE(self);
				4229	for (; p < e; p++) {
				4230	if (!Py_UNICODE_ISSPACE(*p))
				4231	return PyInt_FromLong(0);
				4232	}
				4233	return PyInt_FromLong(1);
				4234	}
				4235
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4236	static char isalpha__doc__[] =
				4237	"S.isalpha() -> int\n\
				4238	\n\
				4239	Return 1 if all characters in S are alphabetic\n\
				4240	and there is at least one character in S, 0 otherwise.";
				4241
				4242	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4243	unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4244	{
				4245	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4246	register const Py_UNICODE *e;
				4247
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4248	/* Shortcut for single character strings */
				4249	if (PyUnicode_GET_SIZE(self) == 1 &&
				4250	Py_UNICODE_ISALPHA(*p))
				4251	return PyInt_FromLong(1);
				4252
				4253	/* Special case for empty strings */
				4254	if (PyString_GET_SIZE(self) == 0)
				4255	return PyInt_FromLong(0);
				4256
				4257	e = p + PyUnicode_GET_SIZE(self);
				4258	for (; p < e; p++) {
				4259	if (!Py_UNICODE_ISALPHA(*p))
				4260	return PyInt_FromLong(0);
				4261	}
				4262	return PyInt_FromLong(1);
				4263	}
				4264
				4265	static char isalnum__doc__[] =
				4266	"S.isalnum() -> int\n\
				4267	\n\
				4268	Return 1 if all characters in S are alphanumeric\n\
				4269	and there is at least one character in S, 0 otherwise.";
				4270
				4271	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4272	unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4273	{
				4274	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4275	register const Py_UNICODE *e;
				4276
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4277	/* Shortcut for single character strings */
				4278	if (PyUnicode_GET_SIZE(self) == 1 &&
				4279	Py_UNICODE_ISALNUM(*p))
				4280	return PyInt_FromLong(1);
				4281
				4282	/* Special case for empty strings */
				4283	if (PyString_GET_SIZE(self) == 0)
				4284	return PyInt_FromLong(0);
				4285
				4286	e = p + PyUnicode_GET_SIZE(self);
				4287	for (; p < e; p++) {
				4288	if (!Py_UNICODE_ISALNUM(*p))
				4289	return PyInt_FromLong(0);
				4290	}
				4291	return PyInt_FromLong(1);
				4292	}
				4293
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4294	static char isdecimal__doc__[] =
				4295	"S.isdecimal() -> int\n\
				4296	\n\
				4297	Return 1 if there are only decimal characters in S,\n\
				4298	0 otherwise.";
				4299
				4300	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4301	unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4302	{
				4303	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4304	register const Py_UNICODE *e;
				4305
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4306	/* Shortcut for single character strings */
				4307	if (PyUnicode_GET_SIZE(self) == 1 &&
				4308	Py_UNICODE_ISDECIMAL(*p))
				4309	return PyInt_FromLong(1);
				4310
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4311	/* Special case for empty strings */
				4312	if (PyString_GET_SIZE(self) == 0)
				4313	return PyInt_FromLong(0);
				4314
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4315	e = p + PyUnicode_GET_SIZE(self);
				4316	for (; p < e; p++) {
				4317	if (!Py_UNICODE_ISDECIMAL(*p))
				4318	return PyInt_FromLong(0);
				4319	}
				4320	return PyInt_FromLong(1);
				4321	}
				4322
				4323	static char isdigit__doc__[] =
				4324	"S.isdigit() -> int\n\
				4325	\n\
				4326	Return 1 if there are only digit characters in S,\n\
				4327	0 otherwise.";
				4328
				4329	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4330	unicode_isdigit(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4331	{
				4332	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4333	register const Py_UNICODE *e;
				4334
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4335	/* Shortcut for single character strings */
				4336	if (PyUnicode_GET_SIZE(self) == 1 &&
				4337	Py_UNICODE_ISDIGIT(*p))
				4338	return PyInt_FromLong(1);
				4339
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4340	/* Special case for empty strings */
				4341	if (PyString_GET_SIZE(self) == 0)
				4342	return PyInt_FromLong(0);
				4343
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4344	e = p + PyUnicode_GET_SIZE(self);
				4345	for (; p < e; p++) {
				4346	if (!Py_UNICODE_ISDIGIT(*p))
				4347	return PyInt_FromLong(0);
				4348	}
				4349	return PyInt_FromLong(1);
				4350	}
				4351
				4352	static char isnumeric__doc__[] =
				4353	"S.isnumeric() -> int\n\
				4354	\n\
				4355	Return 1 if there are only numeric characters in S,\n\
				4356	0 otherwise.";
				4357
				4358	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4359	unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4360	{
				4361	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4362	register const Py_UNICODE *e;
				4363
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4364	/* Shortcut for single character strings */
				4365	if (PyUnicode_GET_SIZE(self) == 1 &&
				4366	Py_UNICODE_ISNUMERIC(*p))
				4367	return PyInt_FromLong(1);
				4368
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4369	/* Special case for empty strings */
				4370	if (PyString_GET_SIZE(self) == 0)
				4371	return PyInt_FromLong(0);
				4372
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4373	e = p + PyUnicode_GET_SIZE(self);
				4374	for (; p < e; p++) {
				4375	if (!Py_UNICODE_ISNUMERIC(*p))
				4376	return PyInt_FromLong(0);
				4377	}
				4378	return PyInt_FromLong(1);
				4379	}
				4380
				4381	static char join__doc__[] =
				4382	"S.join(sequence) -> unicode\n\
				4383	\n\
				4384	Return a string which is the concatenation of the strings in the\n\
				4385	sequence. The separator between elements is S.";
				4386
				4387	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4388	unicode_join(PyObject self, PyObject data)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4389	{
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4390	return PyUnicode_Join(self, data);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4391	}
				4392
				4393	static int
				4394	unicode_length(PyUnicodeObject *self)
				4395	{
				4396	return self->length;
				4397	}
				4398
				4399	static char ljust__doc__[] =
				4400	"S.ljust(width) -> unicode\n\
				4401	\n\
				4402	Return S left justified in a Unicode string of length width. Padding is\n\
				4403	done using spaces.";
				4404
				4405	static PyObject *
				4406	unicode_ljust(PyUnicodeObject self, PyObject args)
				4407	{
				4408	int width;
				4409	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4410	return NULL;
				4411
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4412	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4413	Py_INCREF(self);
				4414	return (PyObject*) self;
				4415	}
				4416
				4417	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4418	}
				4419
				4420	static char lower__doc__[] =
				4421	"S.lower() -> unicode\n\
				4422	\n\
				4423	Return a copy of the string S converted to lowercase.";
				4424
				4425	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4426	unicode_lower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4427	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4428	return fixup(self, fixlower);
				4429	}
				4430
				4431	static char lstrip__doc__[] =
				4432	"S.lstrip() -> unicode\n\
				4433	\n\
				4434	Return a copy of the string S with leading whitespace removed.";
				4435
				4436	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4437	unicode_lstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4438	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4439	return strip(self, 1, 0);
				4440	}
				4441
				4442	static PyObject*
				4443	unicode_repeat(PyUnicodeObject *str, int len)
				4444	{
				4445	PyUnicodeObject *u;
				4446	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4447	int nchars;
				4448	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4449
				4450	if (len < 0)
				4451	len = 0;
				4452
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4453	if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4454	/* no repeat, return original string */
				4455	Py_INCREF(str);
				4456	return (PyObject*) str;
				4457	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4458
				4459	/* ensure # of chars needed doesn't overflow int and # of bytes
				4460	* needed doesn't overflow size_t
				4461	*/
				4462	nchars = len * str->length;
				4463	if (len && nchars / len != str->length) {
				4464	PyErr_SetString(PyExc_OverflowError,
				4465	"repeated string is too long");
				4466	return NULL;
				4467	}
				4468	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4469	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4470	PyErr_SetString(PyExc_OverflowError,
				4471	"repeated string is too long");
				4472	return NULL;
				4473	}
				4474	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4475	if (!u)
				4476	return NULL;
				4477
				4478	p = u->str;
				4479
				4480	while (len-- > 0) {
				4481	Py_UNICODE_COPY(p, str->str, str->length);
				4482	p += str->length;
				4483	}
				4484
				4485	return (PyObject*) u;
				4486	}
				4487
				4488	PyObject PyUnicode_Replace(PyObject obj,
				4489	PyObject *subobj,
				4490	PyObject *replobj,
				4491	int maxcount)
				4492	{
				4493	PyObject *self;
				4494	PyObject *str1;
				4495	PyObject *str2;
				4496	PyObject *result;
				4497
				4498	self = PyUnicode_FromObject(obj);
				4499	if (self == NULL)
				4500	return NULL;
				4501	str1 = PyUnicode_FromObject(subobj);
				4502	if (str1 == NULL) {
				4503	Py_DECREF(self);
				4504	return NULL;
				4505	}
				4506	str2 = PyUnicode_FromObject(replobj);
				4507	if (str2 == NULL) {
				4508	Py_DECREF(self);
				4509	Py_DECREF(str1);
				4510	return NULL;
				4511	}
				4512	result = replace((PyUnicodeObject *)self,
				4513	(PyUnicodeObject *)str1,
				4514	(PyUnicodeObject *)str2,
				4515	maxcount);
				4516	Py_DECREF(self);
				4517	Py_DECREF(str1);
				4518	Py_DECREF(str2);
				4519	return result;
				4520	}
				4521
				4522	static char replace__doc__[] =
				4523	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4524	\n\
				4525	Return a copy of S with all occurrences of substring\n\
				4526	old replaced by new. If the optional argument maxsplit is\n\
				4527	given, only the first maxsplit occurrences are replaced.";
				4528
				4529	static PyObject*
				4530	unicode_replace(PyUnicodeObject self, PyObject args)
				4531	{
				4532	PyUnicodeObject *str1;
				4533	PyUnicodeObject *str2;
				4534	int maxcount = -1;
				4535	PyObject *result;
				4536
				4537	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4538	return NULL;
				4539	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4540	if (str1 == NULL)
				4541	return NULL;
				4542	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4543	if (str2 == NULL)
				4544	return NULL;
				4545
				4546	result = replace(self, str1, str2, maxcount);
				4547
				4548	Py_DECREF(str1);
				4549	Py_DECREF(str2);
				4550	return result;
				4551	}
				4552
				4553	static
				4554	PyObject unicode_repr(PyObject unicode)
				4555	{
				4556	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4557	PyUnicode_GET_SIZE(unicode),
				4558	1);
				4559	}
				4560
				4561	static char rfind__doc__[] =
				4562	"S.rfind(sub [,start [,end]]) -> int\n\
				4563	\n\
				4564	Return the highest index in S where substring sub is found,\n\
				4565	such that sub is contained within s[start,end]. Optional\n\
				4566	arguments start and end are interpreted as in slice notation.\n\
				4567	\n\
				4568	Return -1 on failure.";
				4569
				4570	static PyObject *
				4571	unicode_rfind(PyUnicodeObject self, PyObject args)
				4572	{
				4573	PyUnicodeObject *substring;
				4574	int start = 0;
				4575	int end = INT_MAX;
				4576	PyObject *result;
				4577
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4578	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4579	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4580	return NULL;
				4581	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4582	(PyObject *)substring);
				4583	if (substring == NULL)
				4584	return NULL;
				4585
				4586	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4587
				4588	Py_DECREF(substring);
				4589	return result;
				4590	}
				4591
				4592	static char rindex__doc__[] =
				4593	"S.rindex(sub [,start [,end]]) -> int\n\
				4594	\n\
				4595	Like S.rfind() but raise ValueError when the substring is not found.";
				4596
				4597	static PyObject *
				4598	unicode_rindex(PyUnicodeObject self, PyObject args)
				4599	{
				4600	int result;
				4601	PyUnicodeObject *substring;
				4602	int start = 0;
				4603	int end = INT_MAX;
				4604
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4605	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4606	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4607	return NULL;
				4608	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4609	(PyObject *)substring);
				4610	if (substring == NULL)
				4611	return NULL;
				4612
				4613	result = findstring(self, substring, start, end, -1);
				4614
				4615	Py_DECREF(substring);
				4616	if (result < 0) {
				4617	PyErr_SetString(PyExc_ValueError, "substring not found");
				4618	return NULL;
				4619	}
				4620	return PyInt_FromLong(result);
				4621	}
				4622
				4623	static char rjust__doc__[] =
				4624	"S.rjust(width) -> unicode\n\
				4625	\n\
				4626	Return S right justified in a Unicode string of length width. Padding is\n\
				4627	done using spaces.";
				4628
				4629	static PyObject *
				4630	unicode_rjust(PyUnicodeObject self, PyObject args)
				4631	{
				4632	int width;
				4633	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4634	return NULL;
				4635
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4636	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4637	Py_INCREF(self);
				4638	return (PyObject*) self;
				4639	}
				4640
				4641	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4642	}
				4643
				4644	static char rstrip__doc__[] =
				4645	"S.rstrip() -> unicode\n\
				4646	\n\
				4647	Return a copy of the string S with trailing whitespace removed.";
				4648
				4649	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4650	unicode_rstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4651	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4652	return strip(self, 0, 1);
				4653	}
				4654
				4655	static PyObject*
				4656	unicode_slice(PyUnicodeObject *self, int start, int end)
				4657	{
				4658	/* standard clamping */
				4659	if (start < 0)
				4660	start = 0;
				4661	if (end < 0)
				4662	end = 0;
				4663	if (end > self->length)
				4664	end = self->length;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4665	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4666	/* full slice, return original string */
				4667	Py_INCREF(self);
				4668	return (PyObject*) self;
				4669	}
				4670	if (start > end)
				4671	start = end;
				4672	/* copy slice */
				4673	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4674	end - start);
				4675	}
				4676
				4677	PyObject PyUnicode_Split(PyObject s,
				4678	PyObject *sep,
				4679	int maxsplit)
				4680	{
				4681	PyObject *result;
				4682
				4683	s = PyUnicode_FromObject(s);
				4684	if (s == NULL)
				4685	return NULL;
				4686	if (sep != NULL) {
				4687	sep = PyUnicode_FromObject(sep);
				4688	if (sep == NULL) {
				4689	Py_DECREF(s);
				4690	return NULL;
				4691	}
				4692	}
				4693
				4694	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4695
				4696	Py_DECREF(s);
				4697	Py_XDECREF(sep);
				4698	return result;
				4699	}
				4700
				4701	static char split__doc__[] =
				4702	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4703	\n\
				4704	Return a list of the words in S, using sep as the\n\
				4705	delimiter string. If maxsplit is given, at most maxsplit\n\
				4706	splits are done. If sep is not specified, any whitespace string\n\
				4707	is a separator.";
				4708
				4709	static PyObject*
				4710	unicode_split(PyUnicodeObject self, PyObject args)
				4711	{
				4712	PyObject *substring = Py_None;
				4713	int maxcount = -1;
				4714
				4715	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4716	return NULL;
				4717
				4718	if (substring == Py_None)
				4719	return split(self, NULL, maxcount);
				4720	else if (PyUnicode_Check(substring))
				4721	return split(self, (PyUnicodeObject *)substring, maxcount);
				4722	else
				4723	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4724	}
				4725
				4726	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4727	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4728	\n\
				4729	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4730	Line breaks are not included in the resulting list unless keepends\n\
				4731	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4732
				4733	static PyObject*
				4734	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4735	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4736	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4737
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4738	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4739	return NULL;
				4740
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4741	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4742	}
				4743
				4744	static
				4745	PyObject unicode_str(PyUnicodeObject self)
				4746	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4747	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4748	}
				4749
				4750	static char strip__doc__[] =
				4751	"S.strip() -> unicode\n\
				4752	\n\
				4753	Return a copy of S with leading and trailing whitespace removed.";
				4754
				4755	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4756	unicode_strip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4757	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4758	return strip(self, 1, 1);
				4759	}
				4760
				4761	static char swapcase__doc__[] =
				4762	"S.swapcase() -> unicode\n\
				4763	\n\
				4764	Return a copy of S with uppercase characters converted to lowercase\n\
				4765	and vice versa.";
				4766
				4767	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4768	unicode_swapcase(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4769	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4770	return fixup(self, fixswapcase);
				4771	}
				4772
				4773	static char translate__doc__[] =
				4774	"S.translate(table) -> unicode\n\
				4775	\n\
				4776	Return a copy of the string S, where all characters have been mapped\n\
				4777	through the given translation table, which must be a mapping of\n\
				4778	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4779	are left untouched. Characters mapped to None are deleted.";
				4780
				4781	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4782	unicode_translate(PyUnicodeObject self, PyObject table)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4783	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4784	return PyUnicode_TranslateCharmap(self->str,
				4785	self->length,
				4786	table,
				4787	"ignore");
				4788	}
				4789
				4790	static char upper__doc__[] =
				4791	"S.upper() -> unicode\n\
				4792	\n\
				4793	Return a copy of S converted to uppercase.";
				4794
				4795	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4796	unicode_upper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4797	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4798	return fixup(self, fixupper);
				4799	}
				4800
				4801	#if 0
				4802	static char zfill__doc__[] =
				4803	"S.zfill(width) -> unicode\n\
				4804	\n\
				4805	Pad a numeric string x with zeros on the left, to fill a field\n\
				4806	of the specified width. The string x is never truncated.";
				4807
				4808	static PyObject *
				4809	unicode_zfill(PyUnicodeObject self, PyObject args)
				4810	{
				4811	int fill;
				4812	PyUnicodeObject *u;
				4813
				4814	int width;
				4815	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4816	return NULL;
				4817
				4818	if (self->length >= width) {
				4819	Py_INCREF(self);
				4820	return (PyObject*) self;
				4821	}
				4822
				4823	fill = width - self->length;
				4824
				4825	u = pad(self, fill, 0, '0');
				4826
				4827	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4828	/* move sign to beginning of string */
				4829	u->str[0] = u->str[fill];
				4830	u->str[fill] = '0';
				4831	}
				4832
				4833	return (PyObject*) u;
				4834	}
				4835	#endif
				4836
				4837	#if 0
				4838	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4839	unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4840	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4841	return PyInt_FromLong(unicode_freelist_size);
				4842	}
				4843	#endif
				4844
				4845	static char startswith__doc__[] =
				4846	"S.startswith(prefix[, start[, end]]) -> int\n\
				4847	\n\
				4848	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4849	optional start, test S beginning at that position. With optional end, stop\n\
				4850	comparing S at that position.";
				4851
				4852	static PyObject *
				4853	unicode_startswith(PyUnicodeObject *self,
				4854	PyObject *args)
				4855	{
				4856	PyUnicodeObject *substring;
				4857	int start = 0;
				4858	int end = INT_MAX;
				4859	PyObject *result;
				4860
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4861	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4862	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4863	return NULL;
				4864	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4865	(PyObject *)substring);
				4866	if (substring == NULL)
				4867	return NULL;
				4868
				4869	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4870
				4871	Py_DECREF(substring);
				4872	return result;
				4873	}
				4874
				4875
				4876	static char endswith__doc__[] =
				4877	"S.endswith(suffix[, start[, end]]) -> int\n\
				4878	\n\
				4879	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4880	optional start, test S beginning at that position. With optional end, stop\n\
				4881	comparing S at that position.";
				4882
				4883	static PyObject *
				4884	unicode_endswith(PyUnicodeObject *self,
				4885	PyObject *args)
				4886	{
				4887	PyUnicodeObject *substring;
				4888	int start = 0;
				4889	int end = INT_MAX;
				4890	PyObject *result;
				4891
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4892	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4893	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4894	return NULL;
				4895	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4896	(PyObject *)substring);
				4897	if (substring == NULL)
				4898	return NULL;
				4899
				4900	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4901
				4902	Py_DECREF(substring);
				4903	return result;
				4904	}
				4905
				4906
				4907	static PyMethodDef unicode_methods[] = {
				4908
				4909	/* Order is according to common usage: often used methods should
				4910	appear first, since lookup is done sequentially. */
				4911
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4912	{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
				4913	{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
				4914	{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
				4915	{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
				4916	{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
				4917	{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
				4918	{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
				4919	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
				4920	{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
				4921	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
				4922	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
				4923	{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
				4924	{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
				4925	{"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
				4926	/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
				4927	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
				4928	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
				4929	{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
				4930	{"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
				4931	{"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
				4932	{"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
				4933	{"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
				4934	{"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
				4935	{"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
				4936	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
				4937	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
				4938	{"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
				4939	{"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
				4940	{"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
				4941	{"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
				4942	{"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
				4943	{"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
				4944	{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
				4945	{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
				4946	{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4947	#if 0
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4948	{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
				4949	{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4950	#endif
				4951
				4952	#if 0
				4953	/* This one is just used for debugging the implementation. */
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4954	{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4955	#endif
				4956
				4957	{NULL, NULL}
				4958	};
				4959
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4960	static PySequenceMethods unicode_as_sequence = {
				4961	(inquiry) unicode_length, /* sq_length */
				4962	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4963	(intargfunc) unicode_repeat, /* sq_repeat */
				4964	(intargfunc) unicode_getitem, /* sq_item */
				4965	(intintargfunc) unicode_slice, /* sq_slice */
				4966	0, /* sq_ass_item */
				4967	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4968	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4969	};
				4970
				4971	static int
				4972	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4973	int index,
				4974	const void **ptr)
				4975	{
				4976	if (index != 0) {
				4977	PyErr_SetString(PyExc_SystemError,
				4978	"accessing non-existent unicode segment");
				4979	return -1;
				4980	}
				4981	ptr = (void ) self->str;
				4982	return PyUnicode_GET_DATA_SIZE(self);
				4983	}
				4984
				4985	static int
				4986	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4987	const void **ptr)
				4988	{
				4989	PyErr_SetString(PyExc_TypeError,
				4990	"cannot use unicode as modifyable buffer");
				4991	return -1;
				4992	}
				4993
				4994	static int
				4995	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4996	int *lenp)
				4997	{
				4998	if (lenp)
				4999	*lenp = PyUnicode_GET_DATA_SIZE(self);
				5000	return 1;
				5001	}
				5002
				5003	static int
				5004	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				5005	int index,
				5006	const void **ptr)
				5007	{
				5008	PyObject *str;
				5009
				5010	if (index != 0) {
				5011	PyErr_SetString(PyExc_SystemError,
				5012	"accessing non-existent unicode segment");
				5013	return -1;
				5014	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5015	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5016	if (str == NULL)
				5017	return -1;
				5018	ptr = (void ) PyString_AS_STRING(str);
				5019	return PyString_GET_SIZE(str);
				5020	}
				5021
				5022	/* Helpers for PyUnicode_Format() */
				5023
				5024	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5025	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5026	{
				5027	int argidx = *p_argidx;
				5028	if (argidx < arglen) {
				5029	(*p_argidx)++;
				5030	if (arglen < 0)
				5031	return args;
				5032	else
				5033	return PyTuple_GetItem(args, argidx);
				5034	}
				5035	PyErr_SetString(PyExc_TypeError,
				5036	"not enough arguments for format string");
				5037	return NULL;
				5038	}
				5039
				5040	#define F_LJUST (1<<0)
				5041	#define F_SIGN (1<<1)
				5042	#define F_BLANK (1<<2)
				5043	#define F_ALT (1<<3)
				5044	#define F_ZERO (1<<4)
				5045
				5046	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5047	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5048	{
				5049	register int i;
				5050	int len;
				5051	va_list va;
				5052	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5053	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5054
				5055	/* First, format the string as char array, then expand to Py_UNICODE
				5056	array. */
				5057	charbuffer = (char *)buffer;
				5058	len = vsprintf(charbuffer, format, va);
				5059	for (i = len - 1; i >= 0; i--)
				5060	buffer[i] = (Py_UNICODE) charbuffer[i];
				5061
				5062	va_end(va);
				5063	return len;
				5064	}
				5065
				5066	static int
				5067	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5068	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5069	int flags,
				5070	int prec,
				5071	int type,
				5072	PyObject *v)
				5073	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5074	/* fmt = '%#.' + `prec` + `type`
				5075	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5076	char fmt[20];
				5077	double x;
				5078
				5079	x = PyFloat_AsDouble(v);
				5080	if (x == -1.0 && PyErr_Occurred())
				5081	return -1;
				5082	if (prec < 0)
				5083	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5084	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				5085	type = 'g';
				5086	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5087	/* worst case length calc to ensure no buffer overrun:
				5088	fmt = %#.<prec>g
				5089	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				5090	for any double rep.)
				5091	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				5092	If prec=0 the effective precision is 1 (the leading digit is
				5093	always given), therefore increase by one to 10+prec. */
				5094	if (buflen <= (size_t)10 + (size_t)prec) {
				5095	PyErr_SetString(PyExc_OverflowError,
				5096	"formatted float is too long (precision too long?)");
				5097	return -1;
				5098	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5099	return usprintf(buf, fmt, x);
				5100	}
				5101
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5102	static PyObject*
				5103	formatlong(PyObject *val, int flags, int prec, int type)
				5104	{
				5105	char *buf;
				5106	int i, len;
				5107	PyObject str; / temporary string object. */
				5108	PyUnicodeObject *result;
				5109
				5110	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				5111	if (!str)
				5112	return NULL;
				5113	result = _PyUnicode_New(len);
				5114	for (i = 0; i < len; i++)
				5115	result->str[i] = buf[i];
				5116	result->str[len] = 0;
				5117	Py_DECREF(str);
				5118	return (PyObject*)result;
				5119	}
				5120
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5121	static int
				5122	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5123	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5124	int flags,
				5125	int prec,
				5126	int type,
				5127	PyObject *v)
				5128	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5129	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5130	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				5131	+ 1 + 1 = 24*/
				5132	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5133	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5134	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5135
				5136	x = PyInt_AsLong(v);
				5137	if (x == -1 && PyErr_Occurred())
				5138	return -1;
				5139	if (prec < 0)
				5140	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5141	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				5142	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				5143	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				5144	PyErr_SetString(PyExc_OverflowError,
				5145	"formatted integer is too long (precision too long?)");
				5146	return -1;
				5147	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5148	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				5149	* but we want it (for consistency with other %#x conversions, and
				5150	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5151	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				5152	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				5153	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5154	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5155	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				5156	/* Only way to know what the platform does is to try it. */
				5157	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				5158	if (fmt[1] != (char)type) {
				5159	/* Supply our own leading 0x/0X -- needed under std C */
				5160	use_native_c_format = 0;
				5161	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				5162	}
				5163	}
				5164	if (use_native_c_format)
				5165	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5166	return usprintf(buf, fmt, x);
				5167	}
				5168
				5169	static int
				5170	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5171	size_t buflen,
				5172	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5173	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5174	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5175	if (PyUnicode_Check(v)) {
				5176	if (PyUnicode_GET_SIZE(v) != 1)
				5177	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5178	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5179	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5180
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5181	else if (PyString_Check(v)) {
				5182	if (PyString_GET_SIZE(v) != 1)
				5183	goto onError;
				5184	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				5185	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5186
				5187	else {
				5188	/* Integer input truncated to a character */
				5189	long x;
				5190	x = PyInt_AsLong(v);
				5191	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5192	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5193	buf[0] = (char) x;
				5194	}
				5195	buf[1] = '\0';
				5196	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5197
				5198	onError:
				5199	PyErr_SetString(PyExc_TypeError,
				5200	"%c requires int or char");
				5201	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5202	}
				5203
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so it behaves as a single
   operand wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
				5213
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5214	PyObject PyUnicode_Format(PyObject format,
				5215	PyObject *args)
				5216	{
				5217	Py_UNICODE fmt, res;
				5218	int fmtcnt, rescnt, reslen, arglen, argidx;
				5219	int args_owned = 0;
				5220	PyUnicodeObject *result = NULL;
				5221	PyObject *dict = NULL;
				5222	PyObject *uformat;
				5223
				5224	if (format == NULL \|\| args == NULL) {
				5225	PyErr_BadInternalCall();
				5226	return NULL;
				5227	}
				5228	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5229	if (uformat == NULL)
				5230	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5231	fmt = PyUnicode_AS_UNICODE(uformat);
				5232	fmtcnt = PyUnicode_GET_SIZE(uformat);
				5233
				5234	reslen = rescnt = fmtcnt + 100;
				5235	result = _PyUnicode_New(reslen);
				5236	if (result == NULL)
				5237	goto onError;
				5238	res = PyUnicode_AS_UNICODE(result);
				5239
				5240	if (PyTuple_Check(args)) {
				5241	arglen = PyTuple_Size(args);
				5242	argidx = 0;
				5243	}
				5244	else {
				5245	arglen = -1;
				5246	argidx = -2;
				5247	}
				5248	if (args->ob_type->tp_as_mapping)
				5249	dict = args;
				5250
				5251	while (--fmtcnt >= 0) {
				5252	if (*fmt != '%') {
				5253	if (--rescnt < 0) {
				5254	rescnt = fmtcnt + 100;
				5255	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5256	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5257	return NULL;
				5258	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				5259	--rescnt;
				5260	}
				5261	res++ = fmt++;
				5262	}
				5263	else {
				5264	/* Got a format specifier */
				5265	int flags = 0;
				5266	int width = -1;
				5267	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5268	Py_UNICODE c = '\0';
				5269	Py_UNICODE fill;
				5270	PyObject *v = NULL;
				5271	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5272	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5273	Py_UNICODE sign;
				5274	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5275	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5276
				5277	fmt++;
				5278	if (*fmt == '(') {
				5279	Py_UNICODE *keystart;
				5280	int keylen;
				5281	PyObject *key;
				5282	int pcount = 1;
				5283
				5284	if (dict == NULL) {
				5285	PyErr_SetString(PyExc_TypeError,
				5286	"format requires a mapping");
				5287	goto onError;
				5288	}
				5289	++fmt;
				5290	--fmtcnt;
				5291	keystart = fmt;
				5292	/* Skip over balanced parentheses */
				5293	while (pcount > 0 && --fmtcnt >= 0) {
				5294	if (*fmt == ')')
				5295	--pcount;
				5296	else if (*fmt == '(')
				5297	++pcount;
				5298	fmt++;
				5299	}
				5300	keylen = fmt - keystart - 1;
				5301	if (fmtcnt < 0 \|\| pcount > 0) {
				5302	PyErr_SetString(PyExc_ValueError,
				5303	"incomplete format key");
				5304	goto onError;
				5305	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5306	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5307	then looked up since Python uses strings to hold
				5308	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5309	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5310	key = PyUnicode_EncodeUTF8(keystart,
				5311	keylen,
				5312	NULL);
				5313	if (key == NULL)
				5314	goto onError;
				5315	if (args_owned) {
				5316	Py_DECREF(args);
				5317	args_owned = 0;
				5318	}
				5319	args = PyObject_GetItem(dict, key);
				5320	Py_DECREF(key);
				5321	if (args == NULL) {
				5322	goto onError;
				5323	}
				5324	args_owned = 1;
				5325	arglen = -1;
				5326	argidx = -2;
				5327	}
				5328	while (--fmtcnt >= 0) {
				5329	switch (c = *fmt++) {
				5330	case '-': flags \|= F_LJUST; continue;
				5331	case '+': flags \|= F_SIGN; continue;
				5332	case ' ': flags \|= F_BLANK; continue;
				5333	case '#': flags \|= F_ALT; continue;
				5334	case '0': flags \|= F_ZERO; continue;
				5335	}
				5336	break;
				5337	}
				5338	if (c == '*') {
				5339	v = getnextarg(args, arglen, &argidx);
				5340	if (v == NULL)
				5341	goto onError;
				5342	if (!PyInt_Check(v)) {
				5343	PyErr_SetString(PyExc_TypeError,
				5344	"* wants int");
				5345	goto onError;
				5346	}
				5347	width = PyInt_AsLong(v);
				5348	if (width < 0) {
				5349	flags \|= F_LJUST;
				5350	width = -width;
				5351	}
				5352	if (--fmtcnt >= 0)
				5353	c = *fmt++;
				5354	}
				5355	else if (c >= '0' && c <= '9') {
				5356	width = c - '0';
				5357	while (--fmtcnt >= 0) {
				5358	c = *fmt++;
				5359	if (c < '0' \|\| c > '9')
				5360	break;
				5361	if ((width*10) / 10 != width) {
				5362	PyErr_SetString(PyExc_ValueError,
				5363	"width too big");
				5364	goto onError;
				5365	}
				5366	width = width*10 + (c - '0');
				5367	}
				5368	}
				5369	if (c == '.') {
				5370	prec = 0;
				5371	if (--fmtcnt >= 0)
				5372	c = *fmt++;
				5373	if (c == '*') {
				5374	v = getnextarg(args, arglen, &argidx);
				5375	if (v == NULL)
				5376	goto onError;
				5377	if (!PyInt_Check(v)) {
				5378	PyErr_SetString(PyExc_TypeError,
				5379	"* wants int");
				5380	goto onError;
				5381	}
				5382	prec = PyInt_AsLong(v);
				5383	if (prec < 0)
				5384	prec = 0;
				5385	if (--fmtcnt >= 0)
				5386	c = *fmt++;
				5387	}
				5388	else if (c >= '0' && c <= '9') {
				5389	prec = c - '0';
				5390	while (--fmtcnt >= 0) {
				5391	c = Py_CHARMASK(*fmt++);
				5392	if (c < '0' \|\| c > '9')
				5393	break;
				5394	if ((prec*10) / 10 != prec) {
				5395	PyErr_SetString(PyExc_ValueError,
				5396	"prec too big");
				5397	goto onError;
				5398	}
				5399	prec = prec*10 + (c - '0');
				5400	}
				5401	}
				5402	} /* prec */
				5403	if (fmtcnt >= 0) {
				5404	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5405	if (--fmtcnt >= 0)
				5406	c = *fmt++;
				5407	}
				5408	}
				5409	if (fmtcnt < 0) {
				5410	PyErr_SetString(PyExc_ValueError,
				5411	"incomplete format");
				5412	goto onError;
				5413	}
				5414	if (c != '%') {
				5415	v = getnextarg(args, arglen, &argidx);
				5416	if (v == NULL)
				5417	goto onError;
				5418	}
				5419	sign = 0;
				5420	fill = ' ';
				5421	switch (c) {
				5422
				5423	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5424	pbuf = formatbuf;
				5425	/* presume that buffer length is at least 1 */
				5426	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5427	len = 1;
				5428	break;
				5429
				5430	case 's':
				5431	case 'r':
				5432	if (PyUnicode_Check(v) && c == 's') {
				5433	temp = v;
				5434	Py_INCREF(temp);
				5435	}
				5436	else {
				5437	PyObject *unicode;
				5438	if (c == 's')
				5439	temp = PyObject_Str(v);
				5440	else
				5441	temp = PyObject_Repr(v);
				5442	if (temp == NULL)
				5443	goto onError;
				5444	if (!PyString_Check(temp)) {
				5445	/* XXX Note: this should never happen, since
				5446	PyObject_Repr() and PyObject_Str() assure
				5447	this */
				5448	Py_DECREF(temp);
				5449	PyErr_SetString(PyExc_TypeError,
				5450	"%s argument has non-string str()");
				5451	goto onError;
				5452	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5453	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5454	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5455	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5456	"strict");
				5457	Py_DECREF(temp);
				5458	temp = unicode;
				5459	if (temp == NULL)
				5460	goto onError;
				5461	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5462	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5463	len = PyUnicode_GET_SIZE(temp);
				5464	if (prec >= 0 && len > prec)
				5465	len = prec;
				5466	break;
				5467
				5468	case 'i':
				5469	case 'd':
				5470	case 'u':
				5471	case 'o':
				5472	case 'x':
				5473	case 'X':
				5474	if (c == 'i')
				5475	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5476	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5477	temp = formatlong(v, flags, prec, c);
				5478	if (!temp)
				5479	goto onError;
				5480	pbuf = PyUnicode_AS_UNICODE(temp);
				5481	len = PyUnicode_GET_SIZE(temp);
				5482	/* unbounded ints can always produce
				5483	a sign character! */
				5484	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5485	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5486	else {
				5487	pbuf = formatbuf;
				5488	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5489	flags, prec, c, v);
				5490	if (len < 0)
				5491	goto onError;
				5492	/* only d conversion is signed */
				5493	sign = c == 'd';
				5494	}
				5495	if (flags & F_ZERO)
				5496	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5497	break;
				5498
				5499	case 'e':
				5500	case 'E':
				5501	case 'f':
				5502	case 'g':
				5503	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5504	pbuf = formatbuf;
				5505	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5506	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5507	if (len < 0)
				5508	goto onError;
				5509	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5510	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5511	fill = '0';
				5512	break;
				5513
				5514	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5515	pbuf = formatbuf;
				5516	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5517	if (len < 0)
				5518	goto onError;
				5519	break;
				5520
				5521	default:
				5522	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5523	"unsupported format character '%c' (0x%x) "
				5524	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5525	(31<=c && c<=126) ? c : '?',
				5526	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5527	goto onError;
				5528	}
				5529	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5530	if (pbuf == '-' \|\| pbuf == '+') {
				5531	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5532	len--;
				5533	}
				5534	else if (flags & F_SIGN)
				5535	sign = '+';
				5536	else if (flags & F_BLANK)
				5537	sign = ' ';
				5538	else
				5539	sign = 0;
				5540	}
				5541	if (width < len)
				5542	width = len;
				5543	if (rescnt < width + (sign != 0)) {
				5544	reslen -= rescnt;
				5545	rescnt = width + fmtcnt + 100;
				5546	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5547	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5548	return NULL;
				5549	res = PyUnicode_AS_UNICODE(result)
				5550	+ reslen - rescnt;
				5551	}
				5552	if (sign) {
				5553	if (fill != ' ')
				5554	*res++ = sign;
				5555	rescnt--;
				5556	if (width > len)
				5557	width--;
				5558	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5559	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5560	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5561	assert(pbuf[1] == c);
				5562	if (fill != ' ') {
				5563	res++ = pbuf++;
				5564	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5565	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5566	rescnt -= 2;
				5567	width -= 2;
				5568	if (width < 0)
				5569	width = 0;
				5570	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5571	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5572	if (width > len && !(flags & F_LJUST)) {
				5573	do {
				5574	--rescnt;
				5575	*res++ = fill;
				5576	} while (--width > len);
				5577	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5578	if (fill == ' ') {
				5579	if (sign)
				5580	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5581	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5582	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5583	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5584	res++ = pbuf++;
				5585	res++ = pbuf++;
				5586	}
				5587	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5588	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5589	res += len;
				5590	rescnt -= len;
				5591	while (--width >= len) {
				5592	--rescnt;
				5593	*res++ = ' ';
				5594	}
				5595	if (dict && (argidx < arglen) && c != '%') {
				5596	PyErr_SetString(PyExc_TypeError,
				5597	"not all arguments converted");
				5598	goto onError;
				5599	}
				5600	Py_XDECREF(temp);
				5601	} /* '%' */
				5602	} /* until end */
				5603	if (argidx < arglen && !dict) {
				5604	PyErr_SetString(PyExc_TypeError,
				5605	"not all arguments converted");
				5606	goto onError;
				5607	}
				5608
				5609	if (args_owned) {
				5610	Py_DECREF(args);
				5611	}
				5612	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5613	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5614	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5615	return (PyObject *)result;
				5616
				5617	onError:
				5618	Py_XDECREF(result);
				5619	Py_DECREF(uformat);
				5620	if (args_owned) {
				5621	Py_DECREF(args);
				5622	}
				5623	return NULL;
				5624	}
				5625
				5626	static PyBufferProcs unicode_as_buffer = {
				5627	(getreadbufferproc) unicode_buffer_getreadbuf,
				5628	(getwritebufferproc) unicode_buffer_getwritebuf,
				5629	(getsegcountproc) unicode_buffer_getsegcount,
				5630	(getcharbufferproc) unicode_buffer_getcharbuf,
				5631	};
				5632
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5633	staticforward PyObject *
				5634	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds);
				5635
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5636	static PyObject *
				5637	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5638	{
				5639	PyObject *x = NULL;
				5640	static char *kwlist[] = {"string", "encoding", "errors", 0};
				5641	char *encoding = NULL;
				5642	char *errors = NULL;
				5643
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5644	if (type != &PyUnicode_Type)
				5645	return unicode_subtype_new(type, args, kwds);
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5646	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
				5647	kwlist, &x, &encoding, &errors))
				5648	return NULL;
				5649	if (x == NULL)
				5650	return (PyObject *)_PyUnicode_New(0);
				5651	return PyUnicode_FromEncodedObject(x, encoding, errors);
				5652	}
				5653
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5654	static PyObject *
				5655	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5656	{
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5657	PyUnicodeObject tmp, pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5658	int n;
				5659
				5660	assert(PyType_IsSubtype(type, &PyUnicode_Type));
				5661	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
				5662	if (tmp == NULL)
				5663	return NULL;
				5664	assert(PyUnicode_Check(tmp));
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5665	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
				5666	if (pnew == NULL)
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5667	return NULL;
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5668	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
				5669	if (pnew->str == NULL) {
				5670	_Py_ForgetReference((PyObject *)pnew);
				5671	PyObject_DEL(pnew);
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5672	return NULL;
				5673	}
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5674	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
				5675	pnew->length = n;
				5676	pnew->hash = tmp->hash;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5677	Py_DECREF(tmp);
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5678	return (PyObject *)pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5679	}
				5680
/* Docstring for the unicode type (installed as tp_doc). */
static char unicode_doc[] =
"unicode(string [, encoding[, errors]]) -> object\n"
"\n"
"Create a new Unicode object from the given encoded string.\n"
"encoding defaults to the current default string encoding and \n"
"errors, defining the error handling, to 'strict'.";
				5687
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5688	PyTypeObject PyUnicode_Type = {
				5689	PyObject_HEAD_INIT(&PyType_Type)
				5690	0, /* ob_size */
				5691	"unicode", /* tp_name */
				5692	sizeof(PyUnicodeObject), /* tp_size */
				5693	0, /* tp_itemsize */
				5694	/* Slots */
				5695	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5696	0, /* tp_print */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5697	0, /* tp_getattr */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5698	0, /* tp_setattr */
				5699	(cmpfunc) unicode_compare, /* tp_compare */
				5700	(reprfunc) unicode_repr, /* tp_repr */
				5701	0, /* tp_as_number */
				5702	&unicode_as_sequence, /* tp_as_sequence */
				5703	0, /* tp_as_mapping */
				5704	(hashfunc) unicode_hash, /* tp_hash*/
				5705	0, /* tp_call*/
				5706	(reprfunc) unicode_str, /* tp_str */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5707	PyObject_GenericGetAttr, /* tp_getattro */
				5708	0, /* tp_setattro */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5709	&unicode_as_buffer, /* tp_as_buffer */
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5710	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5711	unicode_doc, /* tp_doc */
				5712	0, /* tp_traverse */
				5713	0, /* tp_clear */
				5714	0, /* tp_richcompare */
				5715	0, /* tp_weaklistoffset */
				5716	0, /* tp_iter */
				5717	0, /* tp_iternext */
				5718	unicode_methods, /* tp_methods */
				5719	0, /* tp_members */
				5720	0, /* tp_getset */
				5721	0, /* tp_base */
				5722	0, /* tp_dict */
				5723	0, /* tp_descr_get */
				5724	0, /* tp_descr_set */
				5725	0, /* tp_dictoffset */
				5726	0, /* tp_init */
				5727	0, /* tp_alloc */
				5728	unicode_new, /* tp_new */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5729	};
				5730
				5731	/* Initialize the Unicode implementation */
				5732
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5733	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5734	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5735	int i;
				5736
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5737	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5738	unicode_freelist = NULL;
				5739	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5740	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5741	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5742	for (i = 0; i < 256; i++)
				5743	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5744	}
				5745
				5746	/* Finalize the Unicode implementation */
				5747
				5748	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5749	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5750	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5751	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5752	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5753
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5754	Py_XDECREF(unicode_empty);
				5755	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5756
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5757	for (i = 0; i < 256; i++) {
				5758	if (unicode_latin1[i]) {
				5759	Py_DECREF(unicode_latin1[i]);
				5760	unicode_latin1[i] = NULL;
				5761	}
				5762	}
				5763
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5764	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5765	PyUnicodeObject *v = u;
				5766	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5767	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5768	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5769	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5770	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5771	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5772	unicode_freelist = NULL;
				5773	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5774	}