Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 83efa8167a8d3c1595d1257367bb8a905cae0e89 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	111	/* --- Globals ------------------------------------------------------------
				112
				113	The globals are initialized by the _PyUnicode_Init() API and should
				114	not be used before calling that API.
				115
				116	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	117
				118	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	119	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	120
				121	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	122	static PyUnicodeObject *unicode_freelist;
				123	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	124
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	125	/* Default encoding to use and assume when NULL is passed as encoding
				126	parameter; it is initialized by _PyUnicode_Init().
				127
				128	Always use the PyUnicode_SetDefaultEncoding() and
				129	PyUnicode_GetDefaultEncoding() APIs to access this global.
				130
				131	*/
				132
				133	static char unicode_default_encoding[100];
				134
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* --- Unicode Object ----------------------------------------------------- */
				136
				137	static
				138	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				139	int length)
				140	{
				141	void *oldstr;
				142
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	143	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	144	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	146
				147	/* Resizing unicode_empty is not allowed. */
				148	if (unicode == unicode_empty) {
				149	PyErr_SetString(PyExc_SystemError,
				150	"can't resize empty unicode object");
				151	return -1;
				152	}
				153
				154	/* We allocate one more byte to make sure the string is
				155	Ux0000 terminated -- XXX is this needed ? */
				156	oldstr = unicode->str;
				157	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				158	if (!unicode->str) {
				159	unicode->str = oldstr;
				160	PyErr_NoMemory();
				161	return -1;
				162	}
				163	unicode->str[length] = 0;
				164	unicode->length = length;
				165
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	166	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	167	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame^]	168	if (unicode->defenc) {
				169	Py_DECREF(unicode->defenc);
				170	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	171	}
				172	unicode->hash = -1;
				173
				174	return 0;
				175	}
				176
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	177	int PyUnicode_Resize(PyObject **unicode,
				178	int length)
				179	{
				180	PyUnicodeObject *v;
				181
				182	if (unicode == NULL) {
				183	PyErr_BadInternalCall();
				184	return -1;
				185	}
				186	v = (PyUnicodeObject )unicode;
				187	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				188	PyErr_BadInternalCall();
				189	return -1;
				190	}
				191	return _PyUnicode_Resize(v, length);
				192	}
				193
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	194	/* We allocate one more byte to make sure the string is
				195	Ux0000 terminated -- XXX is this needed ?
				196
				197	XXX This allocator could further be enhanced by assuring that the
				198	free list never reduces its size below 1.
				199
				200	*/
				201
				202	static
				203	PyUnicodeObject *_PyUnicode_New(int length)
				204	{
				205	register PyUnicodeObject *unicode;
				206
				207	/* Optimization for empty strings */
				208	if (length == 0 && unicode_empty != NULL) {
				209	Py_INCREF(unicode_empty);
				210	return unicode_empty;
				211	}
				212
				213	/* Unicode freelist & memory allocation */
				214	if (unicode_freelist) {
				215	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	216	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	217	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	219	/* Keep-Alive optimization: we only upsize the buffer,
				220	never downsize it. */
				221	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	223	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	224	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	227	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	229	}
				230	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	231	}
				232	else {
				233	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				234	if (unicode == NULL)
				235	return NULL;
				236	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				237	}
				238
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	239	if (!unicode->str) {
				240	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	241	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	242	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	243	unicode->str[length] = 0;
				244	unicode->length = length;
				245	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame^]	246	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	248
				249	onError:
				250	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	251	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	252	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	253	}
				254
				255	static
				256	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				257	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	258	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	259	/* Keep-Alive optimization */
				260	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	261	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	262	unicode->str = NULL;
				263	unicode->length = 0;
				264	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame^]	265	if (unicode->defenc) {
				266	Py_DECREF(unicode->defenc);
				267	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	268	}
				269	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	270	(PyUnicodeObject *)unicode = unicode_freelist;
				271	unicode_freelist = unicode;
				272	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	273	}
				274	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	275	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame^]	276	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	277	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	278	}
				279	}
				280
				281	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				282	int size)
				283	{
				284	PyUnicodeObject *unicode;
				285
				286	unicode = _PyUnicode_New(size);
				287	if (!unicode)
				288	return NULL;
				289
				290	/* Copy the Unicode data into the new object */
				291	if (u != NULL)
				292	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	#ifdef HAVE_WCHAR_H
				298
				299	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				300	int size)
				301	{
				302	PyUnicodeObject *unicode;
				303
				304	if (w == NULL) {
				305	PyErr_BadInternalCall();
				306	return NULL;
				307	}
				308
				309	unicode = _PyUnicode_New(size);
				310	if (!unicode)
				311	return NULL;
				312
				313	/* Copy the wchar_t data into the new object */
				314	#ifdef HAVE_USABLE_WCHAR_T
				315	memcpy(unicode->str, w, size * sizeof(wchar_t));
				316	#else
				317	{
				318	register Py_UNICODE *u;
				319	register int i;
				320	u = PyUnicode_AS_UNICODE(unicode);
				321	for (i = size; i >= 0; i--)
				322	u++ = w++;
				323	}
				324	#endif
				325
				326	return (PyObject *)unicode;
				327	}
				328
				329	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				330	register wchar_t *w,
				331	int size)
				332	{
				333	if (unicode == NULL) {
				334	PyErr_BadInternalCall();
				335	return -1;
				336	}
				337	if (size > PyUnicode_GET_SIZE(unicode))
				338	size = PyUnicode_GET_SIZE(unicode);
				339	#ifdef HAVE_USABLE_WCHAR_T
				340	memcpy(w, unicode->str, size * sizeof(wchar_t));
				341	#else
				342	{
				343	register Py_UNICODE *u;
				344	register int i;
				345	u = PyUnicode_AS_UNICODE(unicode);
				346	for (i = size; i >= 0; i--)
				347	w++ = u++;
				348	}
				349	#endif
				350
				351	return size;
				352	}
				353
				354	#endif
				355
				356	PyObject PyUnicode_FromObject(register PyObject obj)
				357	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	358	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				359	}
				360
				361	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				362	const char *encoding,
				363	const char *errors)
				364	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	365	const char *s;
				366	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	367	int owned = 0;
				368	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	369
				370	if (obj == NULL) {
				371	PyErr_BadInternalCall();
				372	return NULL;
				373	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	374
				375	/* Coerce object */
				376	if (PyInstance_Check(obj)) {
				377	PyObject *func;
				378	func = PyObject_GetAttrString(obj, "__str__");
				379	if (func == NULL) {
				380	PyErr_SetString(PyExc_TypeError,
				381	"coercing to Unicode: instance doesn't define __str__");
				382	return NULL;
				383	}
				384	obj = PyEval_CallObject(func, NULL);
				385	Py_DECREF(func);
				386	if (obj == NULL)
				387	return NULL;
				388	owned = 1;
				389	}
				390	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	391	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	392	v = obj;
				393	if (encoding) {
				394	PyErr_SetString(PyExc_TypeError,
				395	"decoding Unicode is not supported");
				396	return NULL;
				397	}
				398	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	399	}
				400	else if (PyString_Check(obj)) {
				401	s = PyString_AS_STRING(obj);
				402	len = PyString_GET_SIZE(obj);
				403	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	404	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				405	/* Overwrite the error message with something more useful in
				406	case of a TypeError. */
				407	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	408	PyErr_Format(PyExc_TypeError,
				409	"coercing to Unicode: need string or buffer, "
				410	"%.80s found",
				411	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414
				415	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	if (len == 0) {
				417	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	418	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	419	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	420	else
				421	v = PyUnicode_Decode(s, len, encoding, errors);
				422	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	423	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	424	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426	return v;
				427
				428	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	429	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	430	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	431	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	432	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	433	}
				434
				435	PyObject PyUnicode_Decode(const char s,
				436	int size,
				437	const char *encoding,
				438	const char *errors)
				439	{
				440	PyObject buffer = NULL, unicode;
				441
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	442	if (encoding == NULL)
				443	encoding = PyUnicode_GetDefaultEncoding();
				444
				445	/* Shortcuts for common default encodings */
				446	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	447	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	448	else if (strcmp(encoding, "latin-1") == 0)
				449	return PyUnicode_DecodeLatin1(s, size, errors);
				450	else if (strcmp(encoding, "ascii") == 0)
				451	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452
				453	/* Decode via the codec registry */
				454	buffer = PyBuffer_FromMemory((void *)s, size);
				455	if (buffer == NULL)
				456	goto onError;
				457	unicode = PyCodec_Decode(buffer, encoding, errors);
				458	if (unicode == NULL)
				459	goto onError;
				460	if (!PyUnicode_Check(unicode)) {
				461	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	462	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	463	unicode->ob_type->tp_name);
				464	Py_DECREF(unicode);
				465	goto onError;
				466	}
				467	Py_DECREF(buffer);
				468	return unicode;
				469
				470	onError:
				471	Py_XDECREF(buffer);
				472	return NULL;
				473	}
				474
				475	PyObject PyUnicode_Encode(const Py_UNICODE s,
				476	int size,
				477	const char *encoding,
				478	const char *errors)
				479	{
				480	PyObject v, unicode;
				481
				482	unicode = PyUnicode_FromUnicode(s, size);
				483	if (unicode == NULL)
				484	return NULL;
				485	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				486	Py_DECREF(unicode);
				487	return v;
				488	}
				489
				490	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				491	const char *encoding,
				492	const char *errors)
				493	{
				494	PyObject *v;
				495
				496	if (!PyUnicode_Check(unicode)) {
				497	PyErr_BadArgument();
				498	goto onError;
				499	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	500
				501	if (encoding == NULL)
				502	encoding = PyUnicode_GetDefaultEncoding();
				503
				504	/* Shortcuts for common default encodings */
				505	if (errors == NULL) {
				506	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	507	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	508	else if (strcmp(encoding, "latin-1") == 0)
				509	return PyUnicode_AsLatin1String(unicode);
				510	else if (strcmp(encoding, "ascii") == 0)
				511	return PyUnicode_AsASCIIString(unicode);
				512	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	513
				514	/* Encode via the codec registry */
				515	v = PyCodec_Encode(unicode, encoding, errors);
				516	if (v == NULL)
				517	goto onError;
				518	/* XXX Should we really enforce this ? */
				519	if (!PyString_Check(v)) {
				520	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	521	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	522	v->ob_type->tp_name);
				523	Py_DECREF(v);
				524	goto onError;
				525	}
				526	return v;
				527
				528	onError:
				529	return NULL;
				530	}
				531
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame^]	532	/* Return a Python string holding the default encoded value of the
				533	Unicode object.
				534
				535	The resulting string is cached in the Unicode object for subsequent
				536	usage by this function. The cached version is needed to implement
				537	the character buffer interface and will live (at least) as long as
				538	the Unicode object itself.
				539
				540	The refcount of the string is not incremented.
				541
				542	* Exported for internal use by the interpreter only !!! *
				543
				544	*/
				545
				546	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				547	const char *errors)
				548	{
				549	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				550
				551	if (v)
				552	return v;
				553	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				554	if (v && errors == NULL)
				555	((PyUnicodeObject *)unicode)->defenc = v;
				556	return v;
				557	}
				558
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				560	{
				561	if (!PyUnicode_Check(unicode)) {
				562	PyErr_BadArgument();
				563	goto onError;
				564	}
				565	return PyUnicode_AS_UNICODE(unicode);
				566
				567	onError:
				568	return NULL;
				569	}
				570
				571	int PyUnicode_GetSize(PyObject *unicode)
				572	{
				573	if (!PyUnicode_Check(unicode)) {
				574	PyErr_BadArgument();
				575	goto onError;
				576	}
				577	return PyUnicode_GET_SIZE(unicode);
				578
				579	onError:
				580	return -1;
				581	}
				582
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	583	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	584	{
				585	return unicode_default_encoding;
				586	}
				587
				588	int PyUnicode_SetDefaultEncoding(const char *encoding)
				589	{
				590	PyObject *v;
				591
				592	/* Make sure the encoding is valid. As side effect, this also
				593	loads the encoding into the codec registry cache. */
				594	v = _PyCodec_Lookup(encoding);
				595	if (v == NULL)
				596	goto onError;
				597	Py_DECREF(v);
				598	strncpy(unicode_default_encoding,
				599	encoding,
				600	sizeof(unicode_default_encoding));
				601	return 0;
				602
				603	onError:
				604	return -1;
				605	}
				606
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	607	/* --- UTF-8 Codec -------------------------------------------------------- */
				608
				609	static
				610	char utf8_code_length[256] = {
				611	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				612	illegal prefix. see RFC 2279 for details */
				613	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				614	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				615	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				616	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				617	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				618	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				619	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				620	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				621	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				622	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				623	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				624	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				625	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				626	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				627	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				628	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				629	};
				630
				631	static
				632	int utf8_decoding_error(const char **source,
				633	Py_UNICODE **dest,
				634	const char *errors,
				635	const char *details)
				636	{
				637	if ((errors == NULL) \|\|
				638	(strcmp(errors,"strict") == 0)) {
				639	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	640	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	641	details);
				642	return -1;
				643	}
				644	else if (strcmp(errors,"ignore") == 0) {
				645	(*source)++;
				646	return 0;
				647	}
				648	else if (strcmp(errors,"replace") == 0) {
				649	(*source)++;
				650	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				651	(*dest)++;
				652	return 0;
				653	}
				654	else {
				655	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	656	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	657	errors);
				658	return -1;
				659	}
				660	}
				661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	662	PyObject PyUnicode_DecodeUTF8(const char s,
				663	int size,
				664	const char *errors)
				665	{
				666	int n;
				667	const char *e;
				668	PyUnicodeObject *unicode;
				669	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	670	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	671
				672	/* Note: size will always be longer than the resulting Unicode
				673	character count */
				674	unicode = _PyUnicode_New(size);
				675	if (!unicode)
				676	return NULL;
				677	if (size == 0)
				678	return (PyObject *)unicode;
				679
				680	/* Unpack UTF-8 encoded data */
				681	p = unicode->str;
				682	e = s + size;
				683
				684	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	685	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686
				687	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	688	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	689	s++;
				690	continue;
				691	}
				692
				693	n = utf8_code_length[ch];
				694
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	695	if (s + n > e) {
				696	errmsg = "unexpected end of data";
				697	goto utf8Error;
				698	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699
				700	switch (n) {
				701
				702	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	703	errmsg = "unexpected code byte";
				704	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	705	break;
				706
				707	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	708	errmsg = "internal error";
				709	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	710	break;
				711
				712	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	if ((s[1] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	717	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	718	if (ch < 0x80) {
				719	errmsg = "illegal encoding";
				720	goto utf8Error;
				721	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	722	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	723	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	724	break;
				725
				726	case 3:
				727	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	728	(s[2] & 0xc0) != 0x80) {
				729	errmsg = "invalid data";
				730	goto utf8Error;
				731	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	732	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	733	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				734	errmsg = "illegal encoding";
				735	goto utf8Error;
				736	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	738	*p++ = (Py_UNICODE)ch;
				739	break;
				740
				741	case 4:
				742	if ((s[1] & 0xc0) != 0x80 \|\|
				743	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	(s[3] & 0xc0) != 0x80) {
				745	errmsg = "invalid data";
				746	goto utf8Error;
				747	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	748	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				749	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				750	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	751	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				752	byte encoding */
				753	(ch > 0x10ffff)) { /* maximum value allowed for
				754	UTF-16 */
				755	errmsg = "illegal encoding";
				756	goto utf8Error;
				757	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	758	/* compute and append the two surrogates: */
				759
				760	/* translate from 10000..10FFFF to 0..FFFF */
				761	ch -= 0x10000;
				762
				763	/* high surrogate = top 10 bits added to D800 */
				764	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				765
				766	/* low surrogate = bottom 10 bits added to DC00 */
				767	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	768	break;
				769
				770	default:
				771	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	772	errmsg = "unsupported Unicode code range";
				773	goto utf8Error;
				774	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	775	}
				776	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	777	continue;
				778
				779	utf8Error:
				780	if (utf8_decoding_error(&s, &p, errors, errmsg))
				781	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	782	}
				783
				784	/* Adjust length */
				785	if (_PyUnicode_Resize(unicode, p - unicode->str))
				786	goto onError;
				787
				788	return (PyObject *)unicode;
				789
				790	onError:
				791	Py_DECREF(unicode);
				792	return NULL;
				793	}
				794
/* Unused since the UTF-8 encoder gained support for UTF-16
   surrogates; kept compiled-out for reference. */
#if 0
/* Apply the error policy named by errors to a UTF-8 encoding problem.

   errors == NULL or "strict": raise UnicodeError with details, return -1.
   "ignore":                   do nothing, return 0.
   "replace":                  store '?' at *dest, advance *dest, return 0.
   anything else:              raise ValueError, return -1.

   source is accepted but never touched. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    /* A missing policy is treated as "strict". */
    const char *policy = (errors != NULL) ? errors : "strict";

    if (!strcmp(policy, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(policy, "ignore"))
        return 0;
    if (!strcmp(policy, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	828
				829	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				830	int size,
				831	const char *errors)
				832	{
				833	PyObject *v;
				834	char *p;
				835	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	836	Py_UCS4 ch2;
				837	unsigned int cbAllocated = 3 * size;
				838	unsigned int cbWritten = 0;
				839	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	842	if (v == NULL)
				843	return NULL;
				844	if (size == 0)
				845	goto done;
				846
				847	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	while (i < size) {
				849	Py_UCS4 ch = s[i++];
				850	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	851	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	852	cbWritten++;
				853	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	854	else if (ch < 0x0800) {
				855	*p++ = 0xc0 \| (ch >> 6);
				856	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	857	cbWritten += 2;
				858	}
				859	else {
				860	/* Check for high surrogate */
				861	if (0xD800 <= ch && ch <= 0xDBFF) {
				862	if (i != size) {
				863	ch2 = s[i];
				864	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				865
				866	if (cbWritten >= (cbAllocated - 4)) {
				867	/* Provide enough room for some more
				868	surrogates */
				869	cbAllocated += 4*10;
				870	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	871	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	872	}
				873
				874	/* combine the two values */
				875	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				876
				877	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	878	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	879	i++;
				880	cbWritten += 4;
				881	}
				882	}
				883	}
				884	else {
				885	*p++ = (char)(0xe0 \| (ch >> 12));
				886	cbWritten += 3;
				887	}
				888	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				889	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	}
				891	}
				892	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	893	if (_PyString_Resize(&v, p - q))
				894	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	895
				896	done:
				897	return v;
				898
				899	onError:
				900	Py_DECREF(v);
				901	return NULL;
				902	}
				903
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	904	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				905	{
				906	PyObject *str;
				907
				908	if (!PyUnicode_Check(unicode)) {
				909	PyErr_BadArgument();
				910	return NULL;
				911	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame^]	912	str = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				913	PyUnicode_GET_SIZE(unicode),
				914	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	915	if (str == NULL)
				916	return NULL;
				917	Py_INCREF(str);
				918	return str;
				919	}
				920
				921	/* --- UTF-16 Codec ------------------------------------------------------- */
				922
				923	static
				924	int utf16_decoding_error(const Py_UNICODE **source,
				925	Py_UNICODE **dest,
				926	const char *errors,
				927	const char *details)
				928	{
				929	if ((errors == NULL) \|\|
				930	(strcmp(errors,"strict") == 0)) {
				931	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	932	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	933	details);
				934	return -1;
				935	}
				936	else if (strcmp(errors,"ignore") == 0) {
				937	return 0;
				938	}
				939	else if (strcmp(errors,"replace") == 0) {
				940	if (dest) {
				941	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				942	(*dest)++;
				943	}
				944	return 0;
				945	}
				946	else {
				947	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	948	"UTF-16 decoding error; "
				949	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	950	errors);
				951	return -1;
				952	}
				953	}
				954
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	955	PyObject PyUnicode_DecodeUTF16(const char s,
				956	int size,
				957	const char *errors,
				958	int *byteorder)
				959	{
				960	PyUnicodeObject *unicode;
				961	Py_UNICODE *p;
				962	const Py_UNICODE q, e;
				963	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	964	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	965
				966	/* size should be an even number */
				967	if (size % sizeof(Py_UNICODE) != 0) {
				968	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				969	return NULL;
				970	/* The remaining input chars are ignored if we fall through
				971	here... */
				972	}
				973
				974	/* Note: size will always be longer than the resulting Unicode
				975	character count */
				976	unicode = _PyUnicode_New(size);
				977	if (!unicode)
				978	return NULL;
				979	if (size == 0)
				980	return (PyObject *)unicode;
				981
				982	/* Unpack UTF-16 encoded data */
				983	p = unicode->str;
				984	q = (Py_UNICODE *)s;
				985	e = q + (size / sizeof(Py_UNICODE));
				986
				987	if (byteorder)
				988	bo = *byteorder;
				989
				990	while (q < e) {
				991	register Py_UNICODE ch = *q++;
				992
				993	/* Check for BOM marks (U+FEFF) in the input and adjust
				994	current byte order setting accordingly. Swap input
				995	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				996	!) */
				997	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				998	if (ch == 0xFEFF) {
				999	bo = -1;
				1000	continue;
				1001	} else if (ch == 0xFFFE) {
				1002	bo = 1;
				1003	continue;
				1004	}
				1005	if (bo == 1)
				1006	ch = (ch >> 8) \| (ch << 8);
				1007	#else
				1008	if (ch == 0xFEFF) {
				1009	bo = 1;
				1010	continue;
				1011	} else if (ch == 0xFFFE) {
				1012	bo = -1;
				1013	continue;
				1014	}
				1015	if (bo == -1)
				1016	ch = (ch >> 8) \| (ch << 8);
				1017	#endif
				1018	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1019	*p++ = ch;
				1020	continue;
				1021	}
				1022
				1023	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1024	if (q >= e) {
				1025	errmsg = "unexpected end of data";
				1026	goto utf16Error;
				1027	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1028	if (0xDC00 <= q && q <= 0xDFFF) {
				1029	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1030	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1031	/* This is valid data (a UTF-16 surrogate pair), but
				1032	we are not able to store this information since our
				1033	Py_UNICODE type only has 16 bits... this might
				1034	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1035	errmsg = "code pairs are not supported";
				1036	goto utf16Error;
				1037	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1038	else
				1039	continue;
				1040	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1041	errmsg = "illegal encoding";
				1042	/* Fall through to report the error */
				1043
				1044	utf16Error:
				1045	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1046	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1047	}
				1048
				1049	if (byteorder)
				1050	*byteorder = bo;
				1051
				1052	/* Adjust length */
				1053	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1054	goto onError;
				1055
				1056	return (PyObject *)unicode;
				1057
				1058	onError:
				1059	Py_DECREF(unicode);
				1060	return NULL;
				1061	}
				1062
				1063	#undef UTF16_ERROR
				1064
				1065	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1066	int size,
				1067	const char *errors,
				1068	int byteorder)
				1069	{
				1070	PyObject *v;
				1071	Py_UNICODE *p;
				1072	char *q;
				1073
				1074	/* We don't create UTF-16 pairs... */
				1075	v = PyString_FromStringAndSize(NULL,
				1076	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1077	if (v == NULL)
				1078	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1079
				1080	q = PyString_AS_STRING(v);
				1081	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1082	if (byteorder == 0)
				1083	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1084	if (size == 0)
				1085	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1086	if (byteorder == 0 \|\|
				1087	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1088	byteorder == -1
				1089	#else
				1090	byteorder == 1
				1091	#endif
				1092	)
				1093	memcpy(p, s, size * sizeof(Py_UNICODE));
				1094	else
				1095	while (size-- > 0) {
				1096	Py_UNICODE ch = *s++;
				1097	*p++ = (ch >> 8) \| (ch << 8);
				1098	}
				1099	done:
				1100	return v;
				1101	}
				1102
				1103	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1104	{
				1105	if (!PyUnicode_Check(unicode)) {
				1106	PyErr_BadArgument();
				1107	return NULL;
				1108	}
				1109	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1110	PyUnicode_GET_SIZE(unicode),
				1111	NULL,
				1112	0);
				1113	}
				1114
				1115	/* --- Unicode Escape Codec ----------------------------------------------- */
				1116
				1117	static
				1118	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1119	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1120	const char *errors,
				1121	const char *details)
				1122	{
				1123	if ((errors == NULL) \|\|
				1124	(strcmp(errors,"strict") == 0)) {
				1125	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1126	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1127	details);
				1128	return -1;
				1129	}
				1130	else if (strcmp(errors,"ignore") == 0) {
				1131	return 0;
				1132	}
				1133	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1134	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1135	return 0;
				1136	}
				1137	else {
				1138	PyErr_Format(PyExc_ValueError,
				1139	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1140	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1141	errors);
				1142	return -1;
				1143	}
				1144	}
				1145
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1146	static _Py_UCNHashAPI *pucnHash = NULL;
				1147
				1148	static
				1149	int mystrnicmp(const char s1, const char s2, size_t count)
				1150	{
				1151	char c1, c2;
				1152
				1153	if (count)
				1154	{
				1155	do
				1156	{
				1157	c1 = tolower(*(s1++));
				1158	c2 = tolower(*(s2++));
				1159	}
				1160	while(--count && c1 == c2);
				1161
				1162	return c1 - c2;
				1163	}
				1164
				1165	return 0;
				1166	}
				1167
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1168	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1169	int size,
				1170	const char *errors)
				1171	{
				1172	PyUnicodeObject *v;
				1173	Py_UNICODE p = NULL, buf = NULL;
				1174	const char *end;
				1175
				1176	/* Escaped strings will always be longer than the resulting
				1177	Unicode string, so we start with size here and then reduce the
				1178	length after conversion to the true value. */
				1179	v = _PyUnicode_New(size);
				1180	if (v == NULL)
				1181	goto onError;
				1182	if (size == 0)
				1183	return (PyObject *)v;
				1184	p = buf = PyUnicode_AS_UNICODE(v);
				1185	end = s + size;
				1186	while (s < end) {
				1187	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1188	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1189	int i;
				1190
				1191	/* Non-escape characters are interpreted as Unicode ordinals */
				1192	if (*s != '\\') {
				1193	p++ = (unsigned char)s++;
				1194	continue;
				1195	}
				1196
				1197	/* \ - Escapes */
				1198	s++;
				1199	switch (*s++) {
				1200
				1201	/* \x escapes */
				1202	case '\n': break;
				1203	case '\\': *p++ = '\\'; break;
				1204	case '\'': *p++ = '\''; break;
				1205	case '\"': *p++ = '\"'; break;
				1206	case 'b': *p++ = '\b'; break;
				1207	case 'f': p++ = '\014'; break; / FF */
				1208	case 't': *p++ = '\t'; break;
				1209	case 'n': *p++ = '\n'; break;
				1210	case 'r': *p++ = '\r'; break;
				1211	case 'v': p++ = '\013'; break; / VT */
				1212	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1213
				1214	/* \OOO (octal) escapes */
				1215	case '0': case '1': case '2': case '3':
				1216	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1217	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1218	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1219	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1220	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1221	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1222	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1223	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1224	break;
				1225
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1226	/* \xXXXX escape with 1-n hex digits. for compatibility
				1227	with 8-bit strings, this code ignores all but the last
				1228	two digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1229	case 'x':
				1230	x = 0;
				1231	c = (unsigned char)*s;
				1232	if (isxdigit(c)) {
				1233	do {
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1234	x = (x<<4) & 0xF0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1235	if ('0' <= c && c <= '9')
				1236	x += c - '0';
				1237	else if ('a' <= c && c <= 'f')
				1238	x += 10 + c - 'a';
				1239	else
				1240	x += 10 + c - 'A';
				1241	c = (unsigned char)*++s;
				1242	} while (isxdigit(c));
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1243	*p++ = (unsigned char) x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1244	} else {
				1245	*p++ = '\\';
				1246	*p++ = (unsigned char)s[-1];
				1247	}
				1248	break;
				1249
				1250	/* \uXXXX with 4 hex digits */
				1251	case 'u':
				1252	for (x = 0, i = 0; i < 4; i++) {
				1253	c = (unsigned char)s[i];
				1254	if (!isxdigit(c)) {
				1255	if (unicodeescape_decoding_error(&s, &x, errors,
				1256	"truncated \\uXXXX"))
				1257	goto onError;
				1258	i++;
				1259	break;
				1260	}
				1261	x = (x<<4) & ~0xF;
				1262	if (c >= '0' && c <= '9')
				1263	x += c - '0';
				1264	else if (c >= 'a' && c <= 'f')
				1265	x += 10 + c - 'a';
				1266	else
				1267	x += 10 + c - 'A';
				1268	}
				1269	s += i;
				1270	*p++ = x;
				1271	break;
				1272
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1273	case 'N':
				1274	/* Ok, we need to deal with Unicode Character Names now,
				1275	* make sure we've imported the hash table data...
				1276	*/
				1277	if (pucnHash == NULL)
				1278	{
				1279	PyObject mod = 0, v = 0;
				1280
				1281	mod = PyImport_ImportModule("ucnhash");
				1282	if (mod == NULL)
				1283	goto onError;
				1284	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1285	Py_DECREF(mod);
				1286	if (v == NULL)
				1287	{
				1288	goto onError;
				1289	}
				1290	pucnHash = PyCObject_AsVoidPtr(v);
				1291	Py_DECREF(v);
				1292	if (pucnHash == NULL)
				1293	{
				1294	goto onError;
				1295	}
				1296	}
				1297
				1298	if (*s == '{')
				1299	{
				1300	const char *start = s + 1;
				1301	const char *endBrace = start;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1302	Py_UCS4 value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1303	unsigned long j;
				1304
				1305	/* look for either the closing brace, or we
				1306	* exceed the maximum length of the unicode character names
				1307	*/
				1308	while (*endBrace != '}' &&
				1309	(unsigned int)(endBrace - start) <=
				1310	pucnHash->cchMax &&
				1311	endBrace < end)
				1312	{
				1313	endBrace++;
				1314	}
				1315	if (endBrace != end && *endBrace == '}')
				1316	{
				1317	j = pucnHash->hash(start, endBrace - start);
				1318	if (j > pucnHash->cKeys \|\|
				1319	mystrnicmp(
				1320	start,
				1321	((_Py_UnicodeCharacterName *)
				1322	(pucnHash->getValue(j)))->pszUCN,
				1323	(int)(endBrace - start)) != 0)
				1324	{
				1325	if (unicodeescape_decoding_error(
				1326	&s, &x, errors,
				1327	"Invalid Unicode Character Name"))
				1328	{
				1329	goto onError;
				1330	}
				1331	goto ucnFallthrough;
				1332	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1333	value = ((_Py_UnicodeCharacterName *)
				1334	(pucnHash->getValue(j)))->value;
				1335	if (value < 1<<16)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1336	{
				1337	/* In UCS-2 range, easy solution.. */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1338	*p++ = value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1339	}
				1340	else
				1341	{
				1342	/* Oops, its in UCS-4 space, */
				1343	/* compute and append the two surrogates: */
				1344	/* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1345	value -= 0x10000;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1346
				1347	/* high surrogate = top 10 bits added to D800 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1348	*p++ = 0xD800 + (value >> 10);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1349
				1350	/* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1351	*p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1352	}
				1353	s = endBrace + 1;
				1354	}
				1355	else
				1356	{
				1357	if (unicodeescape_decoding_error(
				1358	&s, &x, errors,
				1359	"Unicode name missing closing brace"))
				1360	goto onError;
				1361	goto ucnFallthrough;
				1362	}
				1363	break;
				1364	}
				1365	if (unicodeescape_decoding_error(
				1366	&s, &x, errors,
				1367	"Missing opening brace for Unicode Character Name escape"))
				1368	goto onError;
				1369	ucnFallthrough:
				1370	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1371	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1372	*p++ = '\\';
				1373	*p++ = (unsigned char)s[-1];
				1374	break;
				1375	}
				1376	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1377	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1378	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1379	return (PyObject *)v;
				1380
				1381	onError:
				1382	Py_XDECREF(v);
				1383	return NULL;
				1384	}
				1385
				1386	/* Return a Unicode-Escape string version of the Unicode object.
				1387
				1388	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1389	appropriate.
				1390
				1391	*/
				1392
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1393	static const Py_UNICODE findchar(const Py_UNICODE s,
				1394	int size,
				1395	Py_UNICODE ch);
				1396
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1397	static
				1398	PyObject unicodeescape_string(const Py_UNICODE s,
				1399	int size,
				1400	int quotes)
				1401	{
				1402	PyObject *repr;
				1403	char *p;
				1404	char *q;
				1405
				1406	static const char *hexdigit = "0123456789ABCDEF";
				1407
				1408	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1409	if (repr == NULL)
				1410	return NULL;
				1411
				1412	p = q = PyString_AS_STRING(repr);
				1413
				1414	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1415	*p++ = 'u';
				1416	*p++ = (findchar(s, size, '\'') &&
				1417	!findchar(s, size, '"')) ? '"' : '\'';
				1418	}
				1419	while (size-- > 0) {
				1420	Py_UNICODE ch = *s++;
				1421	/* Escape quotes */
				1422	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1423	*p++ = '\\';
				1424	*p++ = (char) ch;
				1425	}
				1426	/* Map 16-bit characters to '\uxxxx' */
				1427	else if (ch >= 256) {
				1428	*p++ = '\\';
				1429	*p++ = 'u';
				1430	*p++ = hexdigit[(ch >> 12) & 0xf];
				1431	*p++ = hexdigit[(ch >> 8) & 0xf];
				1432	*p++ = hexdigit[(ch >> 4) & 0xf];
				1433	*p++ = hexdigit[ch & 15];
				1434	}
				1435	/* Map non-printable US ASCII to '\ooo' */
				1436	else if (ch < ' ' \|\| ch >= 128) {
				1437	*p++ = '\\';
				1438	*p++ = hexdigit[(ch >> 6) & 7];
				1439	*p++ = hexdigit[(ch >> 3) & 7];
				1440	*p++ = hexdigit[ch & 7];
				1441	}
				1442	/* Copy everything else as-is */
				1443	else
				1444	*p++ = (char) ch;
				1445	}
				1446	if (quotes)
				1447	*p++ = q[1];
				1448
				1449	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1450	if (_PyString_Resize(&repr, p - q))
				1451	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1452
				1453	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1454
				1455	onError:
				1456	Py_DECREF(repr);
				1457	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1458	}
				1459
				1460	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1461	int size)
				1462	{
				1463	return unicodeescape_string(s, size, 0);
				1464	}
				1465
				1466	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1467	{
				1468	if (!PyUnicode_Check(unicode)) {
				1469	PyErr_BadArgument();
				1470	return NULL;
				1471	}
				1472	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1473	PyUnicode_GET_SIZE(unicode));
				1474	}
				1475
				1476	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1477
				1478	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1479	int size,
				1480	const char *errors)
				1481	{
				1482	PyUnicodeObject *v;
				1483	Py_UNICODE p, buf;
				1484	const char *end;
				1485	const char *bs;
				1486
				1487	/* Escaped strings will always be longer than the resulting
				1488	Unicode string, so we start with size here and then reduce the
				1489	length after conversion to the true value. */
				1490	v = _PyUnicode_New(size);
				1491	if (v == NULL)
				1492	goto onError;
				1493	if (size == 0)
				1494	return (PyObject *)v;
				1495	p = buf = PyUnicode_AS_UNICODE(v);
				1496	end = s + size;
				1497	while (s < end) {
				1498	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1499	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1500	int i;
				1501
				1502	/* Non-escape characters are interpreted as Unicode ordinals */
				1503	if (*s != '\\') {
				1504	p++ = (unsigned char)s++;
				1505	continue;
				1506	}
				1507
				1508	/* \u-escapes are only interpreted iff the number of leading
				1509	backslashes if odd */
				1510	bs = s;
				1511	for (;s < end;) {
				1512	if (*s != '\\')
				1513	break;
				1514	p++ = (unsigned char)s++;
				1515	}
				1516	if (((s - bs) & 1) == 0 \|\|
				1517	s >= end \|\|
				1518	*s != 'u') {
				1519	continue;
				1520	}
				1521	p--;
				1522	s++;
				1523
				1524	/* \uXXXX with 4 hex digits */
				1525	for (x = 0, i = 0; i < 4; i++) {
				1526	c = (unsigned char)s[i];
				1527	if (!isxdigit(c)) {
				1528	if (unicodeescape_decoding_error(&s, &x, errors,
				1529	"truncated \\uXXXX"))
				1530	goto onError;
				1531	i++;
				1532	break;
				1533	}
				1534	x = (x<<4) & ~0xF;
				1535	if (c >= '0' && c <= '9')
				1536	x += c - '0';
				1537	else if (c >= 'a' && c <= 'f')
				1538	x += 10 + c - 'a';
				1539	else
				1540	x += 10 + c - 'A';
				1541	}
				1542	s += i;
				1543	*p++ = x;
				1544	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1545	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1546	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1547	return (PyObject *)v;
				1548
				1549	onError:
				1550	Py_XDECREF(v);
				1551	return NULL;
				1552	}
				1553
				1554	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1555	int size)
				1556	{
				1557	PyObject *repr;
				1558	char *p;
				1559	char *q;
				1560
				1561	static const char *hexdigit = "0123456789ABCDEF";
				1562
				1563	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1564	if (repr == NULL)
				1565	return NULL;
				1566
				1567	p = q = PyString_AS_STRING(repr);
				1568	while (size-- > 0) {
				1569	Py_UNICODE ch = *s++;
				1570	/* Map 16-bit characters to '\uxxxx' */
				1571	if (ch >= 256) {
				1572	*p++ = '\\';
				1573	*p++ = 'u';
				1574	*p++ = hexdigit[(ch >> 12) & 0xf];
				1575	*p++ = hexdigit[(ch >> 8) & 0xf];
				1576	*p++ = hexdigit[(ch >> 4) & 0xf];
				1577	*p++ = hexdigit[ch & 15];
				1578	}
				1579	/* Copy everything else as-is */
				1580	else
				1581	*p++ = (char) ch;
				1582	}
				1583	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1584	if (_PyString_Resize(&repr, p - q))
				1585	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1586
				1587	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1588
				1589	onError:
				1590	Py_DECREF(repr);
				1591	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1592	}
				1593
				1594	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1595	{
				1596	if (!PyUnicode_Check(unicode)) {
				1597	PyErr_BadArgument();
				1598	return NULL;
				1599	}
				1600	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1601	PyUnicode_GET_SIZE(unicode));
				1602	}
				1603
				1604	/* --- Latin-1 Codec ------------------------------------------------------ */
				1605
				1606	PyObject PyUnicode_DecodeLatin1(const char s,
				1607	int size,
				1608	const char *errors)
				1609	{
				1610	PyUnicodeObject *v;
				1611	Py_UNICODE *p;
				1612
				1613	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1614	v = _PyUnicode_New(size);
				1615	if (v == NULL)
				1616	goto onError;
				1617	if (size == 0)
				1618	return (PyObject *)v;
				1619	p = PyUnicode_AS_UNICODE(v);
				1620	while (size-- > 0)
				1621	p++ = (unsigned char)s++;
				1622	return (PyObject *)v;
				1623
				1624	onError:
				1625	Py_XDECREF(v);
				1626	return NULL;
				1627	}
				1628
				1629	static
				1630	int latin1_encoding_error(const Py_UNICODE **source,
				1631	char **dest,
				1632	const char *errors,
				1633	const char *details)
				1634	{
				1635	if ((errors == NULL) \|\|
				1636	(strcmp(errors,"strict") == 0)) {
				1637	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1638	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1639	details);
				1640	return -1;
				1641	}
				1642	else if (strcmp(errors,"ignore") == 0) {
				1643	return 0;
				1644	}
				1645	else if (strcmp(errors,"replace") == 0) {
				1646	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1647	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1648	return 0;
				1649	}
				1650	else {
				1651	PyErr_Format(PyExc_ValueError,
				1652	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1653	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1654	errors);
				1655	return -1;
				1656	}
				1657	}
				1658
				1659	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1660	int size,
				1661	const char *errors)
				1662	{
				1663	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1664	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1665	repr = PyString_FromStringAndSize(NULL, size);
				1666	if (repr == NULL)
				1667	return NULL;
				1668
				1669	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1670	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1671	while (size-- > 0) {
				1672	Py_UNICODE ch = *p++;
				1673	if (ch >= 256) {
				1674	if (latin1_encoding_error(&p, &s, errors,
				1675	"ordinal not in range(256)"))
				1676	goto onError;
				1677	}
				1678	else
				1679	*s++ = (char)ch;
				1680	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1681	/* Resize if error handling skipped some characters */
				1682	if (s - start < PyString_GET_SIZE(repr))
				1683	if (_PyString_Resize(&repr, s - start))
				1684	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1685	return repr;
				1686
				1687	onError:
				1688	Py_DECREF(repr);
				1689	return NULL;
				1690	}
				1691
				1692	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1693	{
				1694	if (!PyUnicode_Check(unicode)) {
				1695	PyErr_BadArgument();
				1696	return NULL;
				1697	}
				1698	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1699	PyUnicode_GET_SIZE(unicode),
				1700	NULL);
				1701	}
				1702
				1703	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1704
				1705	static
				1706	int ascii_decoding_error(const char **source,
				1707	Py_UNICODE **dest,
				1708	const char *errors,
				1709	const char *details)
				1710	{
				1711	if ((errors == NULL) \|\|
				1712	(strcmp(errors,"strict") == 0)) {
				1713	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1714	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1715	details);
				1716	return -1;
				1717	}
				1718	else if (strcmp(errors,"ignore") == 0) {
				1719	return 0;
				1720	}
				1721	else if (strcmp(errors,"replace") == 0) {
				1722	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1723	(*dest)++;
				1724	return 0;
				1725	}
				1726	else {
				1727	PyErr_Format(PyExc_ValueError,
				1728	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1729	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1730	errors);
				1731	return -1;
				1732	}
				1733	}
				1734
				1735	PyObject PyUnicode_DecodeASCII(const char s,
				1736	int size,
				1737	const char *errors)
				1738	{
				1739	PyUnicodeObject *v;
				1740	Py_UNICODE *p;
				1741
				1742	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1743	v = _PyUnicode_New(size);
				1744	if (v == NULL)
				1745	goto onError;
				1746	if (size == 0)
				1747	return (PyObject *)v;
				1748	p = PyUnicode_AS_UNICODE(v);
				1749	while (size-- > 0) {
				1750	register unsigned char c;
				1751
				1752	c = (unsigned char)*s++;
				1753	if (c < 128)
				1754	*p++ = c;
				1755	else if (ascii_decoding_error(&s, &p, errors,
				1756	"ordinal not in range(128)"))
				1757	goto onError;
				1758	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1759	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1760	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1761	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1762	return (PyObject *)v;
				1763
				1764	onError:
				1765	Py_XDECREF(v);
				1766	return NULL;
				1767	}
				1768
				1769	static
				1770	int ascii_encoding_error(const Py_UNICODE **source,
				1771	char **dest,
				1772	const char *errors,
				1773	const char *details)
				1774	{
				1775	if ((errors == NULL) \|\|
				1776	(strcmp(errors,"strict") == 0)) {
				1777	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1778	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1779	details);
				1780	return -1;
				1781	}
				1782	else if (strcmp(errors,"ignore") == 0) {
				1783	return 0;
				1784	}
				1785	else if (strcmp(errors,"replace") == 0) {
				1786	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1787	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1788	return 0;
				1789	}
				1790	else {
				1791	PyErr_Format(PyExc_ValueError,
				1792	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1793	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1794	errors);
				1795	return -1;
				1796	}
				1797	}
				1798
				1799	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1800	int size,
				1801	const char *errors)
				1802	{
				1803	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1804	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1805	repr = PyString_FromStringAndSize(NULL, size);
				1806	if (repr == NULL)
				1807	return NULL;
				1808
				1809	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1810	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1811	while (size-- > 0) {
				1812	Py_UNICODE ch = *p++;
				1813	if (ch >= 128) {
				1814	if (ascii_encoding_error(&p, &s, errors,
				1815	"ordinal not in range(128)"))
				1816	goto onError;
				1817	}
				1818	else
				1819	*s++ = (char)ch;
				1820	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1821	/* Resize if error handling skipped some characters */
				1822	if (s - start < PyString_GET_SIZE(repr))
				1823	if (_PyString_Resize(&repr, s - start))
				1824	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1825	return repr;
				1826
				1827	onError:
				1828	Py_DECREF(repr);
				1829	return NULL;
				1830	}
				1831
				1832	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1833	{
				1834	if (!PyUnicode_Check(unicode)) {
				1835	PyErr_BadArgument();
				1836	return NULL;
				1837	}
				1838	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1839	PyUnicode_GET_SIZE(unicode),
				1840	NULL);
				1841	}
				1842
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1843	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1844
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1845	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1846
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1847	PyObject PyUnicode_DecodeMBCS(const char s,
				1848	int size,
				1849	const char *errors)
				1850	{
				1851	PyUnicodeObject *v;
				1852	Py_UNICODE *p;
				1853
				1854	/* First get the size of the result */
				1855	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1856	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1857	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1858
				1859	v = _PyUnicode_New(usize);
				1860	if (v == NULL)
				1861	return NULL;
				1862	if (usize == 0)
				1863	return (PyObject *)v;
				1864	p = PyUnicode_AS_UNICODE(v);
				1865	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1866	Py_DECREF(v);
				1867	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1868	}
				1869
				1870	return (PyObject *)v;
				1871	}
				1872
				1873	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1874	int size,
				1875	const char *errors)
				1876	{
				1877	PyObject *repr;
				1878	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1879	DWORD mbcssize;
				1880
				1881	/* If there are no characters, bail now! */
				1882	if (size==0)
				1883	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1884
				1885	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1886	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1887	if (mbcssize==0)
				1888	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1889
				1890	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1891	if (repr == NULL)
				1892	return NULL;
				1893	if (mbcssize==0)
				1894	return repr;
				1895
				1896	/* Do the conversion */
				1897	s = PyString_AS_STRING(repr);
				1898	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1899	Py_DECREF(repr);
				1900	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1901	}
				1902	return repr;
				1903	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1904
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1905	#endif /* MS_WIN32 */
				1906
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1907	/* --- Character Mapping Codec -------------------------------------------- */
				1908
				1909	static
				1910	int charmap_decoding_error(const char **source,
				1911	Py_UNICODE **dest,
				1912	const char *errors,
				1913	const char *details)
				1914	{
				1915	if ((errors == NULL) \|\|
				1916	(strcmp(errors,"strict") == 0)) {
				1917	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1918	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1919	details);
				1920	return -1;
				1921	}
				1922	else if (strcmp(errors,"ignore") == 0) {
				1923	return 0;
				1924	}
				1925	else if (strcmp(errors,"replace") == 0) {
				1926	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1927	(*dest)++;
				1928	return 0;
				1929	}
				1930	else {
				1931	PyErr_Format(PyExc_ValueError,
				1932	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1933	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1934	errors);
				1935	return -1;
				1936	}
				1937	}
				1938
				1939	PyObject PyUnicode_DecodeCharmap(const char s,
				1940	int size,
				1941	PyObject *mapping,
				1942	const char *errors)
				1943	{
				1944	PyUnicodeObject *v;
				1945	Py_UNICODE *p;
				1946
				1947	/* Default to Latin-1 */
				1948	if (mapping == NULL)
				1949	return PyUnicode_DecodeLatin1(s, size, errors);
				1950
				1951	v = _PyUnicode_New(size);
				1952	if (v == NULL)
				1953	goto onError;
				1954	if (size == 0)
				1955	return (PyObject *)v;
				1956	p = PyUnicode_AS_UNICODE(v);
				1957	while (size-- > 0) {
				1958	unsigned char ch = *s++;
				1959	PyObject w, x;
				1960
				1961	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1962	w = PyInt_FromLong((long)ch);
				1963	if (w == NULL)
				1964	goto onError;
				1965	x = PyObject_GetItem(mapping, w);
				1966	Py_DECREF(w);
				1967	if (x == NULL) {
				1968	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1969	/* No mapping found: default to Latin-1 mapping */
				1970	PyErr_Clear();
				1971	*p++ = (Py_UNICODE)ch;
				1972	continue;
				1973	}
				1974	goto onError;
				1975	}
				1976
				1977	/* Apply mapping */
				1978	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1979	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1980	if (value < 0 \|\| value > 65535) {
				1981	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1982	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1983	Py_DECREF(x);
				1984	goto onError;
				1985	}
				1986	*p++ = (Py_UNICODE)value;
				1987	}
				1988	else if (x == Py_None) {
				1989	/* undefined mapping */
				1990	if (charmap_decoding_error(&s, &p, errors,
				1991	"character maps to <undefined>")) {
				1992	Py_DECREF(x);
				1993	goto onError;
				1994	}
				1995	}
				1996	else if (PyUnicode_Check(x)) {
				1997	if (PyUnicode_GET_SIZE(x) != 1) {
				1998	/* 1-n mapping */
				1999	PyErr_SetString(PyExc_NotImplementedError,
				2000	"1-n mappings are currently not implemented");
				2001	Py_DECREF(x);
				2002	goto onError;
				2003	}
				2004	p++ = PyUnicode_AS_UNICODE(x);
				2005	}
				2006	else {
				2007	/* wrong return value */
				2008	PyErr_SetString(PyExc_TypeError,
				2009	"character mapping must return integer, None or unicode");
				2010	Py_DECREF(x);
				2011	goto onError;
				2012	}
				2013	Py_DECREF(x);
				2014	}
				2015	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2016	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2017	goto onError;
				2018	return (PyObject *)v;
				2019
				2020	onError:
				2021	Py_XDECREF(v);
				2022	return NULL;
				2023	}
				2024
				2025	static
				2026	int charmap_encoding_error(const Py_UNICODE **source,
				2027	char **dest,
				2028	const char *errors,
				2029	const char *details)
				2030	{
				2031	if ((errors == NULL) \|\|
				2032	(strcmp(errors,"strict") == 0)) {
				2033	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2034	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2035	details);
				2036	return -1;
				2037	}
				2038	else if (strcmp(errors,"ignore") == 0) {
				2039	return 0;
				2040	}
				2041	else if (strcmp(errors,"replace") == 0) {
				2042	**dest = '?';
				2043	(*dest)++;
				2044	return 0;
				2045	}
				2046	else {
				2047	PyErr_Format(PyExc_ValueError,
				2048	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2049	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2050	errors);
				2051	return -1;
				2052	}
				2053	}
				2054
				2055	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2056	int size,
				2057	PyObject *mapping,
				2058	const char *errors)
				2059	{
				2060	PyObject *v;
				2061	char *s;
				2062
				2063	/* Default to Latin-1 */
				2064	if (mapping == NULL)
				2065	return PyUnicode_EncodeLatin1(p, size, errors);
				2066
				2067	v = PyString_FromStringAndSize(NULL, size);
				2068	if (v == NULL)
				2069	return NULL;
				2070	s = PyString_AS_STRING(v);
				2071	while (size-- > 0) {
				2072	Py_UNICODE ch = *p++;
				2073	PyObject w, x;
				2074
				2075	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2076	w = PyInt_FromLong((long)ch);
				2077	if (w == NULL)
				2078	goto onError;
				2079	x = PyObject_GetItem(mapping, w);
				2080	Py_DECREF(w);
				2081	if (x == NULL) {
				2082	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2083	/* No mapping found: default to Latin-1 mapping if possible */
				2084	PyErr_Clear();
				2085	if (ch < 256) {
				2086	*s++ = (char)ch;
				2087	continue;
				2088	}
				2089	else if (!charmap_encoding_error(&p, &s, errors,
				2090	"missing character mapping"))
				2091	continue;
				2092	}
				2093	goto onError;
				2094	}
				2095
				2096	/* Apply mapping */
				2097	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2098	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2099	if (value < 0 \|\| value > 255) {
				2100	PyErr_SetString(PyExc_TypeError,
				2101	"character mapping must be in range(256)");
				2102	Py_DECREF(x);
				2103	goto onError;
				2104	}
				2105	*s++ = (char)value;
				2106	}
				2107	else if (x == Py_None) {
				2108	/* undefined mapping */
				2109	if (charmap_encoding_error(&p, &s, errors,
				2110	"character maps to <undefined>")) {
				2111	Py_DECREF(x);
				2112	goto onError;
				2113	}
				2114	}
				2115	else if (PyString_Check(x)) {
				2116	if (PyString_GET_SIZE(x) != 1) {
				2117	/* 1-n mapping */
				2118	PyErr_SetString(PyExc_NotImplementedError,
				2119	"1-n mappings are currently not implemented");
				2120	Py_DECREF(x);
				2121	goto onError;
				2122	}
				2123	s++ = PyString_AS_STRING(x);
				2124	}
				2125	else {
				2126	/* wrong return value */
				2127	PyErr_SetString(PyExc_TypeError,
				2128	"character mapping must return integer, None or unicode");
				2129	Py_DECREF(x);
				2130	goto onError;
				2131	}
				2132	Py_DECREF(x);
				2133	}
				2134	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2135	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2136	goto onError;
				2137	return v;
				2138
				2139	onError:
				2140	Py_DECREF(v);
				2141	return NULL;
				2142	}
				2143
				2144	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2145	PyObject *mapping)
				2146	{
				2147	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2148	PyErr_BadArgument();
				2149	return NULL;
				2150	}
				2151	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2152	PyUnicode_GET_SIZE(unicode),
				2153	mapping,
				2154	NULL);
				2155	}
				2156
				2157	static
				2158	int translate_error(const Py_UNICODE **source,
				2159	Py_UNICODE **dest,
				2160	const char *errors,
				2161	const char *details)
				2162	{
				2163	if ((errors == NULL) \|\|
				2164	(strcmp(errors,"strict") == 0)) {
				2165	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2166	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2167	details);
				2168	return -1;
				2169	}
				2170	else if (strcmp(errors,"ignore") == 0) {
				2171	return 0;
				2172	}
				2173	else if (strcmp(errors,"replace") == 0) {
				2174	**dest = '?';
				2175	(*dest)++;
				2176	return 0;
				2177	}
				2178	else {
				2179	PyErr_Format(PyExc_ValueError,
				2180	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2181	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2182	errors);
				2183	return -1;
				2184	}
				2185	}
				2186
				2187	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2188	int size,
				2189	PyObject *mapping,
				2190	const char *errors)
				2191	{
				2192	PyUnicodeObject *v;
				2193	Py_UNICODE *p;
				2194
				2195	if (mapping == NULL) {
				2196	PyErr_BadArgument();
				2197	return NULL;
				2198	}
				2199
				2200	/* Output will never be longer than input */
				2201	v = _PyUnicode_New(size);
				2202	if (v == NULL)
				2203	goto onError;
				2204	if (size == 0)
				2205	goto done;
				2206	p = PyUnicode_AS_UNICODE(v);
				2207	while (size-- > 0) {
				2208	Py_UNICODE ch = *s++;
				2209	PyObject w, x;
				2210
				2211	/* Get mapping */
				2212	w = PyInt_FromLong(ch);
				2213	if (w == NULL)
				2214	goto onError;
				2215	x = PyObject_GetItem(mapping, w);
				2216	Py_DECREF(w);
				2217	if (x == NULL) {
				2218	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2219	/* No mapping found: default to 1-1 mapping */
				2220	PyErr_Clear();
				2221	*p++ = ch;
				2222	continue;
				2223	}
				2224	goto onError;
				2225	}
				2226
				2227	/* Apply mapping */
				2228	if (PyInt_Check(x))
				2229	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2230	else if (x == Py_None) {
				2231	/* undefined mapping */
				2232	if (translate_error(&s, &p, errors,
				2233	"character maps to <undefined>")) {
				2234	Py_DECREF(x);
				2235	goto onError;
				2236	}
				2237	}
				2238	else if (PyUnicode_Check(x)) {
				2239	if (PyUnicode_GET_SIZE(x) != 1) {
				2240	/* 1-n mapping */
				2241	PyErr_SetString(PyExc_NotImplementedError,
				2242	"1-n mappings are currently not implemented");
				2243	Py_DECREF(x);
				2244	goto onError;
				2245	}
				2246	p++ = PyUnicode_AS_UNICODE(x);
				2247	}
				2248	else {
				2249	/* wrong return value */
				2250	PyErr_SetString(PyExc_TypeError,
				2251	"translate mapping must return integer, None or unicode");
				2252	Py_DECREF(x);
				2253	goto onError;
				2254	}
				2255	Py_DECREF(x);
				2256	}
				2257	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2258	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2259	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2260
				2261	done:
				2262	return (PyObject *)v;
				2263
				2264	onError:
				2265	Py_XDECREF(v);
				2266	return NULL;
				2267	}
				2268
				2269	PyObject PyUnicode_Translate(PyObject str,
				2270	PyObject *mapping,
				2271	const char *errors)
				2272	{
				2273	PyObject *result;
				2274
				2275	str = PyUnicode_FromObject(str);
				2276	if (str == NULL)
				2277	goto onError;
				2278	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2279	PyUnicode_GET_SIZE(str),
				2280	mapping,
				2281	errors);
				2282	Py_DECREF(str);
				2283	return result;
				2284
				2285	onError:
				2286	Py_XDECREF(str);
				2287	return NULL;
				2288	}
				2289
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2290	/* --- Decimal Encoder ---------------------------------------------------- */
				2291
				2292	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2293	int length,
				2294	char *output,
				2295	const char *errors)
				2296	{
				2297	Py_UNICODE p, end;
				2298
				2299	if (output == NULL) {
				2300	PyErr_BadArgument();
				2301	return -1;
				2302	}
				2303
				2304	p = s;
				2305	end = s + length;
				2306	while (p < end) {
				2307	register Py_UNICODE ch = *p++;
				2308	int decimal;
				2309
				2310	if (Py_UNICODE_ISSPACE(ch)) {
				2311	*output++ = ' ';
				2312	continue;
				2313	}
				2314	decimal = Py_UNICODE_TODECIMAL(ch);
				2315	if (decimal >= 0) {
				2316	*output++ = '0' + decimal;
				2317	continue;
				2318	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2319	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2320	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2321	continue;
				2322	}
				2323	/* All other characters are considered invalid */
				2324	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2325	PyErr_SetString(PyExc_ValueError,
				2326	"invalid decimal Unicode string");
				2327	goto onError;
				2328	}
				2329	else if (strcmp(errors, "ignore") == 0)
				2330	continue;
				2331	else if (strcmp(errors, "replace") == 0) {
				2332	*output++ = '?';
				2333	continue;
				2334	}
				2335	}
				2336	/* 0-terminate the output string */
				2337	*output++ = '\0';
				2338	return 0;
				2339
				2340	onError:
				2341	return -1;
				2342	}
				2343
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2344	/* --- Helpers ------------------------------------------------------------ */
				2345
				2346	static
				2347	int count(PyUnicodeObject *self,
				2348	int start,
				2349	int end,
				2350	PyUnicodeObject *substring)
				2351	{
				2352	int count = 0;
				2353
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2354	if (substring->length == 0)
				2355	return (end - start + 1);
				2356
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2357	end -= substring->length;
				2358
				2359	while (start <= end)
				2360	if (Py_UNICODE_MATCH(self, start, substring)) {
				2361	count++;
				2362	start += substring->length;
				2363	} else
				2364	start++;
				2365
				2366	return count;
				2367	}
				2368
				2369	int PyUnicode_Count(PyObject *str,
				2370	PyObject *substr,
				2371	int start,
				2372	int end)
				2373	{
				2374	int result;
				2375
				2376	str = PyUnicode_FromObject(str);
				2377	if (str == NULL)
				2378	return -1;
				2379	substr = PyUnicode_FromObject(substr);
				2380	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2381	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2382	return -1;
				2383	}
				2384
				2385	result = count((PyUnicodeObject *)str,
				2386	start, end,
				2387	(PyUnicodeObject *)substr);
				2388
				2389	Py_DECREF(str);
				2390	Py_DECREF(substr);
				2391	return result;
				2392	}
				2393
				2394	static
				2395	int findstring(PyUnicodeObject *self,
				2396	PyUnicodeObject *substring,
				2397	int start,
				2398	int end,
				2399	int direction)
				2400	{
				2401	if (start < 0)
				2402	start += self->length;
				2403	if (start < 0)
				2404	start = 0;
				2405
				2406	if (substring->length == 0)
				2407	return start;
				2408
				2409	if (end > self->length)
				2410	end = self->length;
				2411	if (end < 0)
				2412	end += self->length;
				2413	if (end < 0)
				2414	end = 0;
				2415
				2416	end -= substring->length;
				2417
				2418	if (direction < 0) {
				2419	for (; end >= start; end--)
				2420	if (Py_UNICODE_MATCH(self, end, substring))
				2421	return end;
				2422	} else {
				2423	for (; start <= end; start++)
				2424	if (Py_UNICODE_MATCH(self, start, substring))
				2425	return start;
				2426	}
				2427
				2428	return -1;
				2429	}
				2430
				2431	int PyUnicode_Find(PyObject *str,
				2432	PyObject *substr,
				2433	int start,
				2434	int end,
				2435	int direction)
				2436	{
				2437	int result;
				2438
				2439	str = PyUnicode_FromObject(str);
				2440	if (str == NULL)
				2441	return -1;
				2442	substr = PyUnicode_FromObject(substr);
				2443	if (substr == NULL) {
				2444	Py_DECREF(substr);
				2445	return -1;
				2446	}
				2447
				2448	result = findstring((PyUnicodeObject *)str,
				2449	(PyUnicodeObject *)substr,
				2450	start, end, direction);
				2451	Py_DECREF(str);
				2452	Py_DECREF(substr);
				2453	return result;
				2454	}
				2455
				2456	static
				2457	int tailmatch(PyUnicodeObject *self,
				2458	PyUnicodeObject *substring,
				2459	int start,
				2460	int end,
				2461	int direction)
				2462	{
				2463	if (start < 0)
				2464	start += self->length;
				2465	if (start < 0)
				2466	start = 0;
				2467
				2468	if (substring->length == 0)
				2469	return 1;
				2470
				2471	if (end > self->length)
				2472	end = self->length;
				2473	if (end < 0)
				2474	end += self->length;
				2475	if (end < 0)
				2476	end = 0;
				2477
				2478	end -= substring->length;
				2479	if (end < start)
				2480	return 0;
				2481
				2482	if (direction > 0) {
				2483	if (Py_UNICODE_MATCH(self, end, substring))
				2484	return 1;
				2485	} else {
				2486	if (Py_UNICODE_MATCH(self, start, substring))
				2487	return 1;
				2488	}
				2489
				2490	return 0;
				2491	}
				2492
				2493	int PyUnicode_Tailmatch(PyObject *str,
				2494	PyObject *substr,
				2495	int start,
				2496	int end,
				2497	int direction)
				2498	{
				2499	int result;
				2500
				2501	str = PyUnicode_FromObject(str);
				2502	if (str == NULL)
				2503	return -1;
				2504	substr = PyUnicode_FromObject(substr);
				2505	if (substr == NULL) {
				2506	Py_DECREF(substr);
				2507	return -1;
				2508	}
				2509
				2510	result = tailmatch((PyUnicodeObject *)str,
				2511	(PyUnicodeObject *)substr,
				2512	start, end, direction);
				2513	Py_DECREF(str);
				2514	Py_DECREF(substr);
				2515	return result;
				2516	}
				2517
				2518	static
				2519	const Py_UNICODE findchar(const Py_UNICODE s,
				2520	int size,
				2521	Py_UNICODE ch)
				2522	{
				2523	/* like wcschr, but doesn't stop at NULL characters */
				2524
				2525	while (size-- > 0) {
				2526	if (*s == ch)
				2527	return s;
				2528	s++;
				2529	}
				2530
				2531	return NULL;
				2532	}
				2533
				2534	/* Apply fixfct filter to the Unicode object self and return a
				2535	reference to the modified object */
				2536
				2537	static
				2538	PyObject fixup(PyUnicodeObject self,
				2539	int (fixfct)(PyUnicodeObject s))
				2540	{
				2541
				2542	PyUnicodeObject *u;
				2543
				2544	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2545	self->length);
				2546	if (u == NULL)
				2547	return NULL;
				2548	if (!fixfct(u)) {
				2549	/* fixfct should return TRUE if it modified the buffer. If
				2550	FALSE, return a reference to the original buffer instead
				2551	(to save space, not time) */
				2552	Py_INCREF(self);
				2553	Py_DECREF(u);
				2554	return (PyObject*) self;
				2555	}
				2556	return (PyObject*) u;
				2557	}
				2558
				2559	static
				2560	int fixupper(PyUnicodeObject *self)
				2561	{
				2562	int len = self->length;
				2563	Py_UNICODE *s = self->str;
				2564	int status = 0;
				2565
				2566	while (len-- > 0) {
				2567	register Py_UNICODE ch;
				2568
				2569	ch = Py_UNICODE_TOUPPER(*s);
				2570	if (ch != *s) {
				2571	status = 1;
				2572	*s = ch;
				2573	}
				2574	s++;
				2575	}
				2576
				2577	return status;
				2578	}
				2579
				2580	static
				2581	int fixlower(PyUnicodeObject *self)
				2582	{
				2583	int len = self->length;
				2584	Py_UNICODE *s = self->str;
				2585	int status = 0;
				2586
				2587	while (len-- > 0) {
				2588	register Py_UNICODE ch;
				2589
				2590	ch = Py_UNICODE_TOLOWER(*s);
				2591	if (ch != *s) {
				2592	status = 1;
				2593	*s = ch;
				2594	}
				2595	s++;
				2596	}
				2597
				2598	return status;
				2599	}
				2600
				2601	static
				2602	int fixswapcase(PyUnicodeObject *self)
				2603	{
				2604	int len = self->length;
				2605	Py_UNICODE *s = self->str;
				2606	int status = 0;
				2607
				2608	while (len-- > 0) {
				2609	if (Py_UNICODE_ISUPPER(*s)) {
				2610	s = Py_UNICODE_TOLOWER(s);
				2611	status = 1;
				2612	} else if (Py_UNICODE_ISLOWER(*s)) {
				2613	s = Py_UNICODE_TOUPPER(s);
				2614	status = 1;
				2615	}
				2616	s++;
				2617	}
				2618
				2619	return status;
				2620	}
				2621
				2622	static
				2623	int fixcapitalize(PyUnicodeObject *self)
				2624	{
				2625	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2626	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2627	return 1;
				2628	}
				2629	return 0;
				2630	}
				2631
				2632	static
				2633	int fixtitle(PyUnicodeObject *self)
				2634	{
				2635	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2636	register Py_UNICODE *e;
				2637	int previous_is_cased;
				2638
				2639	/* Shortcut for single character strings */
				2640	if (PyUnicode_GET_SIZE(self) == 1) {
				2641	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2642	if (*p != ch) {
				2643	*p = ch;
				2644	return 1;
				2645	}
				2646	else
				2647	return 0;
				2648	}
				2649
				2650	e = p + PyUnicode_GET_SIZE(self);
				2651	previous_is_cased = 0;
				2652	for (; p < e; p++) {
				2653	register const Py_UNICODE ch = *p;
				2654
				2655	if (previous_is_cased)
				2656	*p = Py_UNICODE_TOLOWER(ch);
				2657	else
				2658	*p = Py_UNICODE_TOTITLE(ch);
				2659
				2660	if (Py_UNICODE_ISLOWER(ch) \|\|
				2661	Py_UNICODE_ISUPPER(ch) \|\|
				2662	Py_UNICODE_ISTITLE(ch))
				2663	previous_is_cased = 1;
				2664	else
				2665	previous_is_cased = 0;
				2666	}
				2667	return 1;
				2668	}
				2669
				2670	PyObject PyUnicode_Join(PyObject separator,
				2671	PyObject *seq)
				2672	{
				2673	Py_UNICODE *sep;
				2674	int seplen;
				2675	PyUnicodeObject *res = NULL;
				2676	int reslen = 0;
				2677	Py_UNICODE *p;
				2678	int seqlen = 0;
				2679	int sz = 100;
				2680	int i;
				2681
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2682	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2683	if (seqlen < 0 && PyErr_Occurred())
				2684	return NULL;
				2685
				2686	if (separator == NULL) {
				2687	Py_UNICODE blank = ' ';
				2688	sep = &blank;
				2689	seplen = 1;
				2690	}
				2691	else {
				2692	separator = PyUnicode_FromObject(separator);
				2693	if (separator == NULL)
				2694	return NULL;
				2695	sep = PyUnicode_AS_UNICODE(separator);
				2696	seplen = PyUnicode_GET_SIZE(separator);
				2697	}
				2698
				2699	res = _PyUnicode_New(sz);
				2700	if (res == NULL)
				2701	goto onError;
				2702	p = PyUnicode_AS_UNICODE(res);
				2703	reslen = 0;
				2704
				2705	for (i = 0; i < seqlen; i++) {
				2706	int itemlen;
				2707	PyObject *item;
				2708
				2709	item = PySequence_GetItem(seq, i);
				2710	if (item == NULL)
				2711	goto onError;
				2712	if (!PyUnicode_Check(item)) {
				2713	PyObject *v;
				2714	v = PyUnicode_FromObject(item);
				2715	Py_DECREF(item);
				2716	item = v;
				2717	if (item == NULL)
				2718	goto onError;
				2719	}
				2720	itemlen = PyUnicode_GET_SIZE(item);
				2721	while (reslen + itemlen + seplen >= sz) {
				2722	if (_PyUnicode_Resize(res, sz*2))
				2723	goto onError;
				2724	sz *= 2;
				2725	p = PyUnicode_AS_UNICODE(res) + reslen;
				2726	}
				2727	if (i > 0) {
				2728	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2729	p += seplen;
				2730	reslen += seplen;
				2731	}
				2732	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2733	p += itemlen;
				2734	reslen += itemlen;
				2735	Py_DECREF(item);
				2736	}
				2737	if (_PyUnicode_Resize(res, reslen))
				2738	goto onError;
				2739
				2740	Py_XDECREF(separator);
				2741	return (PyObject *)res;
				2742
				2743	onError:
				2744	Py_XDECREF(separator);
				2745	Py_DECREF(res);
				2746	return NULL;
				2747	}
				2748
				2749	static
				2750	PyUnicodeObject pad(PyUnicodeObject self,
				2751	int left,
				2752	int right,
				2753	Py_UNICODE fill)
				2754	{
				2755	PyUnicodeObject *u;
				2756
				2757	if (left < 0)
				2758	left = 0;
				2759	if (right < 0)
				2760	right = 0;
				2761
				2762	if (left == 0 && right == 0) {
				2763	Py_INCREF(self);
				2764	return self;
				2765	}
				2766
				2767	u = _PyUnicode_New(left + self->length + right);
				2768	if (u) {
				2769	if (left)
				2770	Py_UNICODE_FILL(u->str, fill, left);
				2771	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2772	if (right)
				2773	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2774	}
				2775
				2776	return u;
				2777	}
				2778
				2779	#define SPLIT_APPEND(data, left, right) \
				2780	str = PyUnicode_FromUnicode(data + left, right - left); \
				2781	if (!str) \
				2782	goto onError; \
				2783	if (PyList_Append(list, str)) { \
				2784	Py_DECREF(str); \
				2785	goto onError; \
				2786	} \
				2787	else \
				2788	Py_DECREF(str);
				2789
				2790	static
				2791	PyObject split_whitespace(PyUnicodeObject self,
				2792	PyObject *list,
				2793	int maxcount)
				2794	{
				2795	register int i;
				2796	register int j;
				2797	int len = self->length;
				2798	PyObject *str;
				2799
				2800	for (i = j = 0; i < len; ) {
				2801	/* find a token */
				2802	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2803	i++;
				2804	j = i;
				2805	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2806	i++;
				2807	if (j < i) {
				2808	if (maxcount-- <= 0)
				2809	break;
				2810	SPLIT_APPEND(self->str, j, i);
				2811	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2812	i++;
				2813	j = i;
				2814	}
				2815	}
				2816	if (j < len) {
				2817	SPLIT_APPEND(self->str, j, len);
				2818	}
				2819	return list;
				2820
				2821	onError:
				2822	Py_DECREF(list);
				2823	return NULL;
				2824	}
				2825
				2826	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2827	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2828	{
				2829	register int i;
				2830	register int j;
				2831	int len;
				2832	PyObject *list;
				2833	PyObject *str;
				2834	Py_UNICODE *data;
				2835
				2836	string = PyUnicode_FromObject(string);
				2837	if (string == NULL)
				2838	return NULL;
				2839	data = PyUnicode_AS_UNICODE(string);
				2840	len = PyUnicode_GET_SIZE(string);
				2841
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2842	list = PyList_New(0);
				2843	if (!list)
				2844	goto onError;
				2845
				2846	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2847	int eol;
				2848
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2849	/* Find a line and append it */
				2850	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2851	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2852
				2853	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2854	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2855	if (i < len) {
				2856	if (data[i] == '\r' && i + 1 < len &&
				2857	data[i+1] == '\n')
				2858	i += 2;
				2859	else
				2860	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2861	if (keepends)
				2862	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2863	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2864	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2865	j = i;
				2866	}
				2867	if (j < len) {
				2868	SPLIT_APPEND(data, j, len);
				2869	}
				2870
				2871	Py_DECREF(string);
				2872	return list;
				2873
				2874	onError:
				2875	Py_DECREF(list);
				2876	Py_DECREF(string);
				2877	return NULL;
				2878	}
				2879
				2880	static
				2881	PyObject split_char(PyUnicodeObject self,
				2882	PyObject *list,
				2883	Py_UNICODE ch,
				2884	int maxcount)
				2885	{
				2886	register int i;
				2887	register int j;
				2888	int len = self->length;
				2889	PyObject *str;
				2890
				2891	for (i = j = 0; i < len; ) {
				2892	if (self->str[i] == ch) {
				2893	if (maxcount-- <= 0)
				2894	break;
				2895	SPLIT_APPEND(self->str, j, i);
				2896	i = j = i + 1;
				2897	} else
				2898	i++;
				2899	}
				2900	if (j <= len) {
				2901	SPLIT_APPEND(self->str, j, len);
				2902	}
				2903	return list;
				2904
				2905	onError:
				2906	Py_DECREF(list);
				2907	return NULL;
				2908	}
				2909
				2910	static
				2911	PyObject split_substring(PyUnicodeObject self,
				2912	PyObject *list,
				2913	PyUnicodeObject *substring,
				2914	int maxcount)
				2915	{
				2916	register int i;
				2917	register int j;
				2918	int len = self->length;
				2919	int sublen = substring->length;
				2920	PyObject *str;
				2921
				2922	for (i = j = 0; i < len - sublen; ) {
				2923	if (Py_UNICODE_MATCH(self, i, substring)) {
				2924	if (maxcount-- <= 0)
				2925	break;
				2926	SPLIT_APPEND(self->str, j, i);
				2927	i = j = i + sublen;
				2928	} else
				2929	i++;
				2930	}
				2931	if (j <= len) {
				2932	SPLIT_APPEND(self->str, j, len);
				2933	}
				2934	return list;
				2935
				2936	onError:
				2937	Py_DECREF(list);
				2938	return NULL;
				2939	}
				2940
				2941	#undef SPLIT_APPEND
				2942
				2943	static
				2944	PyObject split(PyUnicodeObject self,
				2945	PyUnicodeObject *substring,
				2946	int maxcount)
				2947	{
				2948	PyObject *list;
				2949
				2950	if (maxcount < 0)
				2951	maxcount = INT_MAX;
				2952
				2953	list = PyList_New(0);
				2954	if (!list)
				2955	return NULL;
				2956
				2957	if (substring == NULL)
				2958	return split_whitespace(self,list,maxcount);
				2959
				2960	else if (substring->length == 1)
				2961	return split_char(self,list,substring->str[0],maxcount);
				2962
				2963	else if (substring->length == 0) {
				2964	Py_DECREF(list);
				2965	PyErr_SetString(PyExc_ValueError, "empty separator");
				2966	return NULL;
				2967	}
				2968	else
				2969	return split_substring(self,list,substring,maxcount);
				2970	}
				2971
				2972	static
				2973	PyObject strip(PyUnicodeObject self,
				2974	int left,
				2975	int right)
				2976	{
				2977	Py_UNICODE *p = self->str;
				2978	int start = 0;
				2979	int end = self->length;
				2980
				2981	if (left)
				2982	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2983	start++;
				2984
				2985	if (right)
				2986	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2987	end--;
				2988
				2989	if (start == 0 && end == self->length) {
				2990	/* couldn't strip anything off, return original string */
				2991	Py_INCREF(self);
				2992	return (PyObject*) self;
				2993	}
				2994
				2995	return (PyObject*) PyUnicode_FromUnicode(
				2996	self->str + start,
				2997	end - start
				2998	);
				2999	}
				3000
				3001	static
				3002	PyObject replace(PyUnicodeObject self,
				3003	PyUnicodeObject *str1,
				3004	PyUnicodeObject *str2,
				3005	int maxcount)
				3006	{
				3007	PyUnicodeObject *u;
				3008
				3009	if (maxcount < 0)
				3010	maxcount = INT_MAX;
				3011
				3012	if (str1->length == 1 && str2->length == 1) {
				3013	int i;
				3014
				3015	/* replace characters */
				3016	if (!findchar(self->str, self->length, str1->str[0])) {
				3017	/* nothing to replace, return original string */
				3018	Py_INCREF(self);
				3019	u = self;
				3020	} else {
				3021	Py_UNICODE u1 = str1->str[0];
				3022	Py_UNICODE u2 = str2->str[0];
				3023
				3024	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3025	self->str,
				3026	self->length
				3027	);
				3028	if (u)
				3029	for (i = 0; i < u->length; i++)
				3030	if (u->str[i] == u1) {
				3031	if (--maxcount < 0)
				3032	break;
				3033	u->str[i] = u2;
				3034	}
				3035	}
				3036
				3037	} else {
				3038	int n, i;
				3039	Py_UNICODE *p;
				3040
				3041	/* replace strings */
				3042	n = count(self, 0, self->length, str1);
				3043	if (n > maxcount)
				3044	n = maxcount;
				3045	if (n == 0) {
				3046	/* nothing to replace, return original string */
				3047	Py_INCREF(self);
				3048	u = self;
				3049	} else {
				3050	u = _PyUnicode_New(
				3051	self->length + n * (str2->length - str1->length));
				3052	if (u) {
				3053	i = 0;
				3054	p = u->str;
				3055	while (i <= self->length - str1->length)
				3056	if (Py_UNICODE_MATCH(self, i, str1)) {
				3057	/* replace string segment */
				3058	Py_UNICODE_COPY(p, str2->str, str2->length);
				3059	p += str2->length;
				3060	i += str1->length;
				3061	if (--n <= 0) {
				3062	/* copy remaining part */
				3063	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3064	break;
				3065	}
				3066	} else
				3067	*p++ = self->str[i++];
				3068	}
				3069	}
				3070	}
				3071
				3072	return (PyObject *) u;
				3073	}
				3074
				3075	/* --- Unicode Object Methods --------------------------------------------- */
				3076
				3077	static char title__doc__[] =
				3078	"S.title() -> unicode\n\
				3079	\n\
				3080	Return a titlecased version of S, i.e. words start with title case\n\
				3081	characters, all remaining cased characters have lower case.";
				3082
				3083	static PyObject*
				3084	unicode_title(PyUnicodeObject self, PyObject args)
				3085	{
				3086	if (!PyArg_NoArgs(args))
				3087	return NULL;
				3088	return fixup(self, fixtitle);
				3089	}
				3090
				3091	static char capitalize__doc__[] =
				3092	"S.capitalize() -> unicode\n\
				3093	\n\
				3094	Return a capitalized version of S, i.e. make the first character\n\
				3095	have upper case.";
				3096
				3097	static PyObject*
				3098	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3099	{
				3100	if (!PyArg_NoArgs(args))
				3101	return NULL;
				3102	return fixup(self, fixcapitalize);
				3103	}
				3104
				3105	#if 0
				3106	static char capwords__doc__[] =
				3107	"S.capwords() -> unicode\n\
				3108	\n\
				3109	Apply .capitalize() to all words in S and return the result with\n\
				3110	normalized whitespace (all whitespace strings are replaced by ' ').";
				3111
				3112	static PyObject*
				3113	unicode_capwords(PyUnicodeObject self, PyObject args)
				3114	{
				3115	PyObject *list;
				3116	PyObject *item;
				3117	int i;
				3118
				3119	if (!PyArg_NoArgs(args))
				3120	return NULL;
				3121
				3122	/* Split into words */
				3123	list = split(self, NULL, -1);
				3124	if (!list)
				3125	return NULL;
				3126
				3127	/* Capitalize each word */
				3128	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3129	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3130	fixcapitalize);
				3131	if (item == NULL)
				3132	goto onError;
				3133	Py_DECREF(PyList_GET_ITEM(list, i));
				3134	PyList_SET_ITEM(list, i, item);
				3135	}
				3136
				3137	/* Join the words to form a new string */
				3138	item = PyUnicode_Join(NULL, list);
				3139
				3140	onError:
				3141	Py_DECREF(list);
				3142	return (PyObject *)item;
				3143	}
				3144	#endif
				3145
				3146	static char center__doc__[] =
				3147	"S.center(width) -> unicode\n\
				3148	\n\
				3149	Return S centered in a Unicode string of length width. Padding is done\n\
				3150	using spaces.";
				3151
				3152	static PyObject *
				3153	unicode_center(PyUnicodeObject self, PyObject args)
				3154	{
				3155	int marg, left;
				3156	int width;
				3157
				3158	if (!PyArg_ParseTuple(args, "i:center", &width))
				3159	return NULL;
				3160
				3161	if (self->length >= width) {
				3162	Py_INCREF(self);
				3163	return (PyObject*) self;
				3164	}
				3165
				3166	marg = width - self->length;
				3167	left = marg / 2 + (marg & width & 1);
				3168
				3169	return (PyObject*) pad(self, left, marg - left, ' ');
				3170	}
				3171
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3172	/* speedy UTF-16 code point order comparison */
				3173	/* gleaned from: */
				3174	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3175
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3176	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3177	{
				3178	0, 0, 0, 0, 0, 0, 0, 0,
				3179	0, 0, 0, 0, 0, 0, 0, 0,
				3180	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3181	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3182	};
				3183
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3184	static int
				3185	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3186	{
				3187	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3188
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3189	Py_UNICODE *s1 = str1->str;
				3190	Py_UNICODE *s2 = str2->str;
				3191
				3192	len1 = str1->length;
				3193	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3194
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3195	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3196	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3197	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3198
				3199	c1 = *s1++;
				3200	c2 = *s2++;
				3201	if (c1 > (1<<11) * 26)
				3202	c1 += utf16Fixup[c1>>11];
				3203	if (c2 > (1<<11) * 26)
				3204	c2 += utf16Fixup[c2>>11];
				3205
				3206	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3207	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3208	if (diff)
				3209	return (diff < 0) ? -1 : (diff != 0);
				3210	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3211	}
				3212
				3213	return (len1 < len2) ? -1 : (len1 != len2);
				3214	}
				3215
				3216	int PyUnicode_Compare(PyObject *left,
				3217	PyObject *right)
				3218	{
				3219	PyUnicodeObject u = NULL, v = NULL;
				3220	int result;
				3221
				3222	/* Coerce the two arguments */
				3223	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3224	if (u == NULL)
				3225	goto onError;
				3226	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3227	if (v == NULL)
				3228	goto onError;
				3229
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3230	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3231	if (v == u) {
				3232	Py_DECREF(u);
				3233	Py_DECREF(v);
				3234	return 0;
				3235	}
				3236
				3237	result = unicode_compare(u, v);
				3238
				3239	Py_DECREF(u);
				3240	Py_DECREF(v);
				3241	return result;
				3242
				3243	onError:
				3244	Py_XDECREF(u);
				3245	Py_XDECREF(v);
				3246	return -1;
				3247	}
				3248
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3249	int PyUnicode_Contains(PyObject *container,
				3250	PyObject *element)
				3251	{
				3252	PyUnicodeObject u = NULL, v = NULL;
				3253	int result;
				3254	register const Py_UNICODE p, e;
				3255	register Py_UNICODE ch;
				3256
				3257	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3258	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3259	if (v == NULL) {
				3260	PyErr_SetString(PyExc_TypeError,
				3261	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3262	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3263	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3264	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3265	if (u == NULL) {
				3266	Py_DECREF(v);
				3267	goto onError;
				3268	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3269
				3270	/* Check v in u */
				3271	if (PyUnicode_GET_SIZE(v) != 1) {
				3272	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3273	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3274	goto onError;
				3275	}
				3276	ch = *PyUnicode_AS_UNICODE(v);
				3277	p = PyUnicode_AS_UNICODE(u);
				3278	e = p + PyUnicode_GET_SIZE(u);
				3279	result = 0;
				3280	while (p < e) {
				3281	if (*p++ == ch) {
				3282	result = 1;
				3283	break;
				3284	}
				3285	}
				3286
				3287	Py_DECREF(u);
				3288	Py_DECREF(v);
				3289	return result;
				3290
				3291	onError:
				3292	Py_XDECREF(u);
				3293	Py_XDECREF(v);
				3294	return -1;
				3295	}
				3296
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3297	/* Concat to string or Unicode object giving a new Unicode object. */
				3298
				3299	PyObject PyUnicode_Concat(PyObject left,
				3300	PyObject *right)
				3301	{
				3302	PyUnicodeObject u = NULL, v = NULL, *w;
				3303
				3304	/* Coerce the two arguments */
				3305	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3306	if (u == NULL)
				3307	goto onError;
				3308	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3309	if (v == NULL)
				3310	goto onError;
				3311
				3312	/* Shortcuts */
				3313	if (v == unicode_empty) {
				3314	Py_DECREF(v);
				3315	return (PyObject *)u;
				3316	}
				3317	if (u == unicode_empty) {
				3318	Py_DECREF(u);
				3319	return (PyObject *)v;
				3320	}
				3321
				3322	/* Concat the two Unicode strings */
				3323	w = _PyUnicode_New(u->length + v->length);
				3324	if (w == NULL)
				3325	goto onError;
				3326	Py_UNICODE_COPY(w->str, u->str, u->length);
				3327	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3328
				3329	Py_DECREF(u);
				3330	Py_DECREF(v);
				3331	return (PyObject *)w;
				3332
				3333	onError:
				3334	Py_XDECREF(u);
				3335	Py_XDECREF(v);
				3336	return NULL;
				3337	}
				3338
				3339	static char count__doc__[] =
				3340	"S.count(sub[, start[, end]]) -> int\n\
				3341	\n\
				3342	Return the number of occurrences of substring sub in Unicode string\n\
				3343	S[start:end]. Optional arguments start and end are\n\
				3344	interpreted as in slice notation.";
				3345
				3346	static PyObject *
				3347	unicode_count(PyUnicodeObject self, PyObject args)
				3348	{
				3349	PyUnicodeObject *substring;
				3350	int start = 0;
				3351	int end = INT_MAX;
				3352	PyObject *result;
				3353
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3354	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3355	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3356	return NULL;
				3357
				3358	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3359	(PyObject *)substring);
				3360	if (substring == NULL)
				3361	return NULL;
				3362
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3363	if (start < 0)
				3364	start += self->length;
				3365	if (start < 0)
				3366	start = 0;
				3367	if (end > self->length)
				3368	end = self->length;
				3369	if (end < 0)
				3370	end += self->length;
				3371	if (end < 0)
				3372	end = 0;
				3373
				3374	result = PyInt_FromLong((long) count(self, start, end, substring));
				3375
				3376	Py_DECREF(substring);
				3377	return result;
				3378	}
				3379
				3380	static char encode__doc__[] =
				3381	"S.encode([encoding[,errors]]) -> string\n\
				3382	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3383	Return an encoded string version of S. Default encoding is the current\n\
				3384	default string encoding. errors may be given to set a different error\n\
				3385	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3386	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3387
				3388	static PyObject *
				3389	unicode_encode(PyUnicodeObject self, PyObject args)
				3390	{
				3391	char *encoding = NULL;
				3392	char *errors = NULL;
				3393	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3394	return NULL;
				3395	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3396	}
				3397
				3398	static char expandtabs__doc__[] =
				3399	"S.expandtabs([tabsize]) -> unicode\n\
				3400	\n\
				3401	Return a copy of S where all tab characters are expanded using spaces.\n\
				3402	If tabsize is not given, a tab size of 8 characters is assumed.";
				3403
				3404	static PyObject*
				3405	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3406	{
				3407	Py_UNICODE *e;
				3408	Py_UNICODE *p;
				3409	Py_UNICODE *q;
				3410	int i, j;
				3411	PyUnicodeObject *u;
				3412	int tabsize = 8;
				3413
				3414	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3415	return NULL;
				3416
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3417	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3418	i = j = 0;
				3419	e = self->str + self->length;
				3420	for (p = self->str; p < e; p++)
				3421	if (*p == '\t') {
				3422	if (tabsize > 0)
				3423	j += tabsize - (j % tabsize);
				3424	}
				3425	else {
				3426	j++;
				3427	if (p == '\n' \|\| p == '\r') {
				3428	i += j;
				3429	j = 0;
				3430	}
				3431	}
				3432
				3433	/* Second pass: create output string and fill it */
				3434	u = _PyUnicode_New(i + j);
				3435	if (!u)
				3436	return NULL;
				3437
				3438	j = 0;
				3439	q = u->str;
				3440
				3441	for (p = self->str; p < e; p++)
				3442	if (*p == '\t') {
				3443	if (tabsize > 0) {
				3444	i = tabsize - (j % tabsize);
				3445	j += i;
				3446	while (i--)
				3447	*q++ = ' ';
				3448	}
				3449	}
				3450	else {
				3451	j++;
				3452	q++ = p;
				3453	if (p == '\n' \|\| p == '\r')
				3454	j = 0;
				3455	}
				3456
				3457	return (PyObject*) u;
				3458	}
				3459
				3460	static char find__doc__[] =
				3461	"S.find(sub [,start [,end]]) -> int\n\
				3462	\n\
				3463	Return the lowest index in S where substring sub is found,\n\
				3464	such that sub is contained within s[start,end]. Optional\n\
				3465	arguments start and end are interpreted as in slice notation.\n\
				3466	\n\
				3467	Return -1 on failure.";
				3468
				3469	static PyObject *
				3470	unicode_find(PyUnicodeObject self, PyObject args)
				3471	{
				3472	PyUnicodeObject *substring;
				3473	int start = 0;
				3474	int end = INT_MAX;
				3475	PyObject *result;
				3476
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3477	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3478	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3479	return NULL;
				3480	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3481	(PyObject *)substring);
				3482	if (substring == NULL)
				3483	return NULL;
				3484
				3485	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3486
				3487	Py_DECREF(substring);
				3488	return result;
				3489	}
				3490
				3491	static PyObject *
				3492	unicode_getitem(PyUnicodeObject *self, int index)
				3493	{
				3494	if (index < 0 \|\| index >= self->length) {
				3495	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3496	return NULL;
				3497	}
				3498
				3499	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3500	}
				3501
				3502	static long
				3503	unicode_hash(PyUnicodeObject *self)
				3504	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3505	/* Since Unicode objects compare equal to their ASCII string
				3506	counterparts, they should use the individual character values
				3507	as basis for their hash value. This is needed to assure that
				3508	strings and Unicode objects behave in the same way as
				3509	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3510
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3511	register int len;
				3512	register Py_UNICODE *p;
				3513	register long x;
				3514
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3515	if (self->hash != -1)
				3516	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3517	len = PyUnicode_GET_SIZE(self);
				3518	p = PyUnicode_AS_UNICODE(self);
				3519	x = *p << 7;
				3520	while (--len >= 0)
				3521	x = (1000003x) ^ p++;
				3522	x ^= PyUnicode_GET_SIZE(self);
				3523	if (x == -1)
				3524	x = -2;
				3525	self->hash = x;
				3526	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3527	}
				3528
				3529	static char index__doc__[] =
				3530	"S.index(sub [,start [,end]]) -> int\n\
				3531	\n\
				3532	Like S.find() but raise ValueError when the substring is not found.";
				3533
				3534	static PyObject *
				3535	unicode_index(PyUnicodeObject self, PyObject args)
				3536	{
				3537	int result;
				3538	PyUnicodeObject *substring;
				3539	int start = 0;
				3540	int end = INT_MAX;
				3541
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3542	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3543	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3544	return NULL;
				3545
				3546	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3547	(PyObject *)substring);
				3548	if (substring == NULL)
				3549	return NULL;
				3550
				3551	result = findstring(self, substring, start, end, 1);
				3552
				3553	Py_DECREF(substring);
				3554	if (result < 0) {
				3555	PyErr_SetString(PyExc_ValueError, "substring not found");
				3556	return NULL;
				3557	}
				3558	return PyInt_FromLong(result);
				3559	}
				3560
				3561	static char islower__doc__[] =
				3562	"S.islower() -> int\n\
				3563	\n\
				3564	Return 1 if all cased characters in S are lowercase and there is\n\
				3565	at least one cased character in S, 0 otherwise.";
				3566
				3567	static PyObject*
				3568	unicode_islower(PyUnicodeObject self, PyObject args)
				3569	{
				3570	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3571	register const Py_UNICODE *e;
				3572	int cased;
				3573
				3574	if (!PyArg_NoArgs(args))
				3575	return NULL;
				3576
				3577	/* Shortcut for single character strings */
				3578	if (PyUnicode_GET_SIZE(self) == 1)
				3579	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3580
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3581	/* Special case for empty strings */
				3582	if (PyString_GET_SIZE(self) == 0)
				3583	return PyInt_FromLong(0);
				3584
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3585	e = p + PyUnicode_GET_SIZE(self);
				3586	cased = 0;
				3587	for (; p < e; p++) {
				3588	register const Py_UNICODE ch = *p;
				3589
				3590	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3591	return PyInt_FromLong(0);
				3592	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3593	cased = 1;
				3594	}
				3595	return PyInt_FromLong(cased);
				3596	}
				3597
				3598	static char isupper__doc__[] =
				3599	"S.isupper() -> int\n\
				3600	\n\
				3601	Return 1 if all cased characters in S are uppercase and there is\n\
				3602	at least one cased character in S, 0 otherwise.";
				3603
				3604	static PyObject*
				3605	unicode_isupper(PyUnicodeObject self, PyObject args)
				3606	{
				3607	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3608	register const Py_UNICODE *e;
				3609	int cased;
				3610
				3611	if (!PyArg_NoArgs(args))
				3612	return NULL;
				3613
				3614	/* Shortcut for single character strings */
				3615	if (PyUnicode_GET_SIZE(self) == 1)
				3616	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3617
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3618	/* Special case for empty strings */
				3619	if (PyString_GET_SIZE(self) == 0)
				3620	return PyInt_FromLong(0);
				3621
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3622	e = p + PyUnicode_GET_SIZE(self);
				3623	cased = 0;
				3624	for (; p < e; p++) {
				3625	register const Py_UNICODE ch = *p;
				3626
				3627	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3628	return PyInt_FromLong(0);
				3629	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3630	cased = 1;
				3631	}
				3632	return PyInt_FromLong(cased);
				3633	}
				3634
				3635	static char istitle__doc__[] =
				3636	"S.istitle() -> int\n\
				3637	\n\
				3638	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3639	may only follow uncased characters and lowercase characters only cased\n\
				3640	ones. Return 0 otherwise.";
				3641
				3642	static PyObject*
				3643	unicode_istitle(PyUnicodeObject self, PyObject args)
				3644	{
				3645	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3646	register const Py_UNICODE *e;
				3647	int cased, previous_is_cased;
				3648
				3649	if (!PyArg_NoArgs(args))
				3650	return NULL;
				3651
				3652	/* Shortcut for single character strings */
				3653	if (PyUnicode_GET_SIZE(self) == 1)
				3654	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3655	(Py_UNICODE_ISUPPER(*p) != 0));
				3656
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3657	/* Special case for empty strings */
				3658	if (PyString_GET_SIZE(self) == 0)
				3659	return PyInt_FromLong(0);
				3660
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3661	e = p + PyUnicode_GET_SIZE(self);
				3662	cased = 0;
				3663	previous_is_cased = 0;
				3664	for (; p < e; p++) {
				3665	register const Py_UNICODE ch = *p;
				3666
				3667	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3668	if (previous_is_cased)
				3669	return PyInt_FromLong(0);
				3670	previous_is_cased = 1;
				3671	cased = 1;
				3672	}
				3673	else if (Py_UNICODE_ISLOWER(ch)) {
				3674	if (!previous_is_cased)
				3675	return PyInt_FromLong(0);
				3676	previous_is_cased = 1;
				3677	cased = 1;
				3678	}
				3679	else
				3680	previous_is_cased = 0;
				3681	}
				3682	return PyInt_FromLong(cased);
				3683	}
				3684
				3685	static char isspace__doc__[] =
				3686	"S.isspace() -> int\n\
				3687	\n\
				3688	Return 1 if there are only whitespace characters in S,\n\
				3689	0 otherwise.";
				3690
				3691	static PyObject*
				3692	unicode_isspace(PyUnicodeObject self, PyObject args)
				3693	{
				3694	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3695	register const Py_UNICODE *e;
				3696
				3697	if (!PyArg_NoArgs(args))
				3698	return NULL;
				3699
				3700	/* Shortcut for single character strings */
				3701	if (PyUnicode_GET_SIZE(self) == 1 &&
				3702	Py_UNICODE_ISSPACE(*p))
				3703	return PyInt_FromLong(1);
				3704
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3705	/* Special case for empty strings */
				3706	if (PyString_GET_SIZE(self) == 0)
				3707	return PyInt_FromLong(0);
				3708
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3709	e = p + PyUnicode_GET_SIZE(self);
				3710	for (; p < e; p++) {
				3711	if (!Py_UNICODE_ISSPACE(*p))
				3712	return PyInt_FromLong(0);
				3713	}
				3714	return PyInt_FromLong(1);
				3715	}
				3716
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3717	static char isalpha__doc__[] =
				3718	"S.isalpha() -> int\n\
				3719	\n\
				3720	Return 1 if all characters in S are alphabetic\n\
				3721	and there is at least one character in S, 0 otherwise.";
				3722
				3723	static PyObject*
				3724	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3725	{
				3726	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3727	register const Py_UNICODE *e;
				3728
				3729	if (!PyArg_NoArgs(args))
				3730	return NULL;
				3731
				3732	/* Shortcut for single character strings */
				3733	if (PyUnicode_GET_SIZE(self) == 1 &&
				3734	Py_UNICODE_ISALPHA(*p))
				3735	return PyInt_FromLong(1);
				3736
				3737	/* Special case for empty strings */
				3738	if (PyString_GET_SIZE(self) == 0)
				3739	return PyInt_FromLong(0);
				3740
				3741	e = p + PyUnicode_GET_SIZE(self);
				3742	for (; p < e; p++) {
				3743	if (!Py_UNICODE_ISALPHA(*p))
				3744	return PyInt_FromLong(0);
				3745	}
				3746	return PyInt_FromLong(1);
				3747	}
				3748
				3749	static char isalnum__doc__[] =
				3750	"S.isalnum() -> int\n\
				3751	\n\
				3752	Return 1 if all characters in S are alphanumeric\n\
				3753	and there is at least one character in S, 0 otherwise.";
				3754
				3755	static PyObject*
				3756	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3757	{
				3758	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3759	register const Py_UNICODE *e;
				3760
				3761	if (!PyArg_NoArgs(args))
				3762	return NULL;
				3763
				3764	/* Shortcut for single character strings */
				3765	if (PyUnicode_GET_SIZE(self) == 1 &&
				3766	Py_UNICODE_ISALNUM(*p))
				3767	return PyInt_FromLong(1);
				3768
				3769	/* Special case for empty strings */
				3770	if (PyString_GET_SIZE(self) == 0)
				3771	return PyInt_FromLong(0);
				3772
				3773	e = p + PyUnicode_GET_SIZE(self);
				3774	for (; p < e; p++) {
				3775	if (!Py_UNICODE_ISALNUM(*p))
				3776	return PyInt_FromLong(0);
				3777	}
				3778	return PyInt_FromLong(1);
				3779	}
				3780
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3781	static char isdecimal__doc__[] =
				3782	"S.isdecimal() -> int\n\
				3783	\n\
				3784	Return 1 if there are only decimal characters in S,\n\
				3785	0 otherwise.";
				3786
				3787	static PyObject*
				3788	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3789	{
				3790	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3791	register const Py_UNICODE *e;
				3792
				3793	if (!PyArg_NoArgs(args))
				3794	return NULL;
				3795
				3796	/* Shortcut for single character strings */
				3797	if (PyUnicode_GET_SIZE(self) == 1 &&
				3798	Py_UNICODE_ISDECIMAL(*p))
				3799	return PyInt_FromLong(1);
				3800
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3801	/* Special case for empty strings */
				3802	if (PyString_GET_SIZE(self) == 0)
				3803	return PyInt_FromLong(0);
				3804
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3805	e = p + PyUnicode_GET_SIZE(self);
				3806	for (; p < e; p++) {
				3807	if (!Py_UNICODE_ISDECIMAL(*p))
				3808	return PyInt_FromLong(0);
				3809	}
				3810	return PyInt_FromLong(1);
				3811	}
				3812
				3813	static char isdigit__doc__[] =
				3814	"S.isdigit() -> int\n\
				3815	\n\
				3816	Return 1 if there are only digit characters in S,\n\
				3817	0 otherwise.";
				3818
				3819	static PyObject*
				3820	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3821	{
				3822	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3823	register const Py_UNICODE *e;
				3824
				3825	if (!PyArg_NoArgs(args))
				3826	return NULL;
				3827
				3828	/* Shortcut for single character strings */
				3829	if (PyUnicode_GET_SIZE(self) == 1 &&
				3830	Py_UNICODE_ISDIGIT(*p))
				3831	return PyInt_FromLong(1);
				3832
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3833	/* Special case for empty strings */
				3834	if (PyString_GET_SIZE(self) == 0)
				3835	return PyInt_FromLong(0);
				3836
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3837	e = p + PyUnicode_GET_SIZE(self);
				3838	for (; p < e; p++) {
				3839	if (!Py_UNICODE_ISDIGIT(*p))
				3840	return PyInt_FromLong(0);
				3841	}
				3842	return PyInt_FromLong(1);
				3843	}
				3844
				3845	static char isnumeric__doc__[] =
				3846	"S.isnumeric() -> int\n\
				3847	\n\
				3848	Return 1 if there are only numeric characters in S,\n\
				3849	0 otherwise.";
				3850
				3851	static PyObject*
				3852	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3853	{
				3854	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3855	register const Py_UNICODE *e;
				3856
				3857	if (!PyArg_NoArgs(args))
				3858	return NULL;
				3859
				3860	/* Shortcut for single character strings */
				3861	if (PyUnicode_GET_SIZE(self) == 1 &&
				3862	Py_UNICODE_ISNUMERIC(*p))
				3863	return PyInt_FromLong(1);
				3864
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3865	/* Special case for empty strings */
				3866	if (PyString_GET_SIZE(self) == 0)
				3867	return PyInt_FromLong(0);
				3868
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3869	e = p + PyUnicode_GET_SIZE(self);
				3870	for (; p < e; p++) {
				3871	if (!Py_UNICODE_ISNUMERIC(*p))
				3872	return PyInt_FromLong(0);
				3873	}
				3874	return PyInt_FromLong(1);
				3875	}
				3876
				3877	static char join__doc__[] =
				3878	"S.join(sequence) -> unicode\n\
				3879	\n\
				3880	Return a string which is the concatenation of the strings in the\n\
				3881	sequence. The separator between elements is S.";
				3882
				3883	static PyObject*
				3884	unicode_join(PyUnicodeObject self, PyObject args)
				3885	{
				3886	PyObject *data;
				3887	if (!PyArg_ParseTuple(args, "O:join", &data))
				3888	return NULL;
				3889
				3890	return PyUnicode_Join((PyObject *)self, data);
				3891	}
				3892
				3893	static int
				3894	unicode_length(PyUnicodeObject *self)
				3895	{
				3896	return self->length;
				3897	}
				3898
				3899	static char ljust__doc__[] =
				3900	"S.ljust(width) -> unicode\n\
				3901	\n\
				3902	Return S left justified in a Unicode string of length width. Padding is\n\
				3903	done using spaces.";
				3904
				3905	static PyObject *
				3906	unicode_ljust(PyUnicodeObject self, PyObject args)
				3907	{
				3908	int width;
				3909	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3910	return NULL;
				3911
				3912	if (self->length >= width) {
				3913	Py_INCREF(self);
				3914	return (PyObject*) self;
				3915	}
				3916
				3917	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3918	}
				3919
				3920	static char lower__doc__[] =
				3921	"S.lower() -> unicode\n\
				3922	\n\
				3923	Return a copy of the string S converted to lowercase.";
				3924
				3925	static PyObject*
				3926	unicode_lower(PyUnicodeObject self, PyObject args)
				3927	{
				3928	if (!PyArg_NoArgs(args))
				3929	return NULL;
				3930	return fixup(self, fixlower);
				3931	}
				3932
				3933	static char lstrip__doc__[] =
				3934	"S.lstrip() -> unicode\n\
				3935	\n\
				3936	Return a copy of the string S with leading whitespace removed.";
				3937
				3938	static PyObject *
				3939	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3940	{
				3941	if (!PyArg_NoArgs(args))
				3942	return NULL;
				3943	return strip(self, 1, 0);
				3944	}
				3945
				3946	static PyObject*
				3947	unicode_repeat(PyUnicodeObject *str, int len)
				3948	{
				3949	PyUnicodeObject *u;
				3950	Py_UNICODE *p;
				3951
				3952	if (len < 0)
				3953	len = 0;
				3954
				3955	if (len == 1) {
				3956	/* no repeat, return original string */
				3957	Py_INCREF(str);
				3958	return (PyObject*) str;
				3959	}
				3960
				3961	u = _PyUnicode_New(len * str->length);
				3962	if (!u)
				3963	return NULL;
				3964
				3965	p = u->str;
				3966
				3967	while (len-- > 0) {
				3968	Py_UNICODE_COPY(p, str->str, str->length);
				3969	p += str->length;
				3970	}
				3971
				3972	return (PyObject*) u;
				3973	}
				3974
				3975	PyObject PyUnicode_Replace(PyObject obj,
				3976	PyObject *subobj,
				3977	PyObject *replobj,
				3978	int maxcount)
				3979	{
				3980	PyObject *self;
				3981	PyObject *str1;
				3982	PyObject *str2;
				3983	PyObject *result;
				3984
				3985	self = PyUnicode_FromObject(obj);
				3986	if (self == NULL)
				3987	return NULL;
				3988	str1 = PyUnicode_FromObject(subobj);
				3989	if (str1 == NULL) {
				3990	Py_DECREF(self);
				3991	return NULL;
				3992	}
				3993	str2 = PyUnicode_FromObject(replobj);
				3994	if (str2 == NULL) {
				3995	Py_DECREF(self);
				3996	Py_DECREF(str1);
				3997	return NULL;
				3998	}
				3999	result = replace((PyUnicodeObject *)self,
				4000	(PyUnicodeObject *)str1,
				4001	(PyUnicodeObject *)str2,
				4002	maxcount);
				4003	Py_DECREF(self);
				4004	Py_DECREF(str1);
				4005	Py_DECREF(str2);
				4006	return result;
				4007	}
				4008
				4009	static char replace__doc__[] =
				4010	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4011	\n\
				4012	Return a copy of S with all occurrences of substring\n\
				4013	old replaced by new. If the optional argument maxsplit is\n\
				4014	given, only the first maxsplit occurrences are replaced.";
				4015
				4016	static PyObject*
				4017	unicode_replace(PyUnicodeObject self, PyObject args)
				4018	{
				4019	PyUnicodeObject *str1;
				4020	PyUnicodeObject *str2;
				4021	int maxcount = -1;
				4022	PyObject *result;
				4023
				4024	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4025	return NULL;
				4026	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4027	if (str1 == NULL)
				4028	return NULL;
				4029	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4030	if (str2 == NULL)
				4031	return NULL;
				4032
				4033	result = replace(self, str1, str2, maxcount);
				4034
				4035	Py_DECREF(str1);
				4036	Py_DECREF(str2);
				4037	return result;
				4038	}
				4039
				4040	static
				4041	PyObject unicode_repr(PyObject unicode)
				4042	{
				4043	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4044	PyUnicode_GET_SIZE(unicode),
				4045	1);
				4046	}
				4047
				4048	static char rfind__doc__[] =
				4049	"S.rfind(sub [,start [,end]]) -> int\n\
				4050	\n\
				4051	Return the highest index in S where substring sub is found,\n\
				4052	such that sub is contained within s[start,end]. Optional\n\
				4053	arguments start and end are interpreted as in slice notation.\n\
				4054	\n\
				4055	Return -1 on failure.";
				4056
				4057	static PyObject *
				4058	unicode_rfind(PyUnicodeObject self, PyObject args)
				4059	{
				4060	PyUnicodeObject *substring;
				4061	int start = 0;
				4062	int end = INT_MAX;
				4063	PyObject *result;
				4064
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4065	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4066	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4067	return NULL;
				4068	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4069	(PyObject *)substring);
				4070	if (substring == NULL)
				4071	return NULL;
				4072
				4073	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4074
				4075	Py_DECREF(substring);
				4076	return result;
				4077	}
				4078
				4079	static char rindex__doc__[] =
				4080	"S.rindex(sub [,start [,end]]) -> int\n\
				4081	\n\
				4082	Like S.rfind() but raise ValueError when the substring is not found.";
				4083
				4084	static PyObject *
				4085	unicode_rindex(PyUnicodeObject self, PyObject args)
				4086	{
				4087	int result;
				4088	PyUnicodeObject *substring;
				4089	int start = 0;
				4090	int end = INT_MAX;
				4091
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4092	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4093	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4094	return NULL;
				4095	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4096	(PyObject *)substring);
				4097	if (substring == NULL)
				4098	return NULL;
				4099
				4100	result = findstring(self, substring, start, end, -1);
				4101
				4102	Py_DECREF(substring);
				4103	if (result < 0) {
				4104	PyErr_SetString(PyExc_ValueError, "substring not found");
				4105	return NULL;
				4106	}
				4107	return PyInt_FromLong(result);
				4108	}
				4109
				4110	static char rjust__doc__[] =
				4111	"S.rjust(width) -> unicode\n\
				4112	\n\
				4113	Return S right justified in a Unicode string of length width. Padding is\n\
				4114	done using spaces.";
				4115
				4116	static PyObject *
				4117	unicode_rjust(PyUnicodeObject self, PyObject args)
				4118	{
				4119	int width;
				4120	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4121	return NULL;
				4122
				4123	if (self->length >= width) {
				4124	Py_INCREF(self);
				4125	return (PyObject*) self;
				4126	}
				4127
				4128	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4129	}
				4130
				4131	static char rstrip__doc__[] =
				4132	"S.rstrip() -> unicode\n\
				4133	\n\
				4134	Return a copy of the string S with trailing whitespace removed.";
				4135
				4136	static PyObject *
				4137	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4138	{
				4139	if (!PyArg_NoArgs(args))
				4140	return NULL;
				4141	return strip(self, 0, 1);
				4142	}
				4143
				4144	static PyObject*
				4145	unicode_slice(PyUnicodeObject *self, int start, int end)
				4146	{
				4147	/* standard clamping */
				4148	if (start < 0)
				4149	start = 0;
				4150	if (end < 0)
				4151	end = 0;
				4152	if (end > self->length)
				4153	end = self->length;
				4154	if (start == 0 && end == self->length) {
				4155	/* full slice, return original string */
				4156	Py_INCREF(self);
				4157	return (PyObject*) self;
				4158	}
				4159	if (start > end)
				4160	start = end;
				4161	/* copy slice */
				4162	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4163	end - start);
				4164	}
				4165
				4166	PyObject PyUnicode_Split(PyObject s,
				4167	PyObject *sep,
				4168	int maxsplit)
				4169	{
				4170	PyObject *result;
				4171
				4172	s = PyUnicode_FromObject(s);
				4173	if (s == NULL)
				4174	return NULL;
				4175	if (sep != NULL) {
				4176	sep = PyUnicode_FromObject(sep);
				4177	if (sep == NULL) {
				4178	Py_DECREF(s);
				4179	return NULL;
				4180	}
				4181	}
				4182
				4183	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4184
				4185	Py_DECREF(s);
				4186	Py_XDECREF(sep);
				4187	return result;
				4188	}
				4189
				4190	static char split__doc__[] =
				4191	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4192	\n\
				4193	Return a list of the words in S, using sep as the\n\
				4194	delimiter string. If maxsplit is given, at most maxsplit\n\
				4195	splits are done. If sep is not specified, any whitespace string\n\
				4196	is a separator.";
				4197
				4198	static PyObject*
				4199	unicode_split(PyUnicodeObject self, PyObject args)
				4200	{
				4201	PyObject *substring = Py_None;
				4202	int maxcount = -1;
				4203
				4204	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4205	return NULL;
				4206
				4207	if (substring == Py_None)
				4208	return split(self, NULL, maxcount);
				4209	else if (PyUnicode_Check(substring))
				4210	return split(self, (PyUnicodeObject *)substring, maxcount);
				4211	else
				4212	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4213	}
				4214
				4215	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4216	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4217	\n\
				4218	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4219	Line breaks are not included in the resulting list unless keepends\n\
				4220	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4221
				4222	static PyObject*
				4223	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4224	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4225	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4226
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4227	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4228	return NULL;
				4229
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4230	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4231	}
				4232
				4233	static
				4234	PyObject unicode_str(PyUnicodeObject self)
				4235	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4236	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4237	}
				4238
				4239	static char strip__doc__[] =
				4240	"S.strip() -> unicode\n\
				4241	\n\
				4242	Return a copy of S with leading and trailing whitespace removed.";
				4243
				4244	static PyObject *
				4245	unicode_strip(PyUnicodeObject self, PyObject args)
				4246	{
				4247	if (!PyArg_NoArgs(args))
				4248	return NULL;
				4249	return strip(self, 1, 1);
				4250	}
				4251
				4252	static char swapcase__doc__[] =
				4253	"S.swapcase() -> unicode\n\
				4254	\n\
				4255	Return a copy of S with uppercase characters converted to lowercase\n\
				4256	and vice versa.";
				4257
				4258	static PyObject*
				4259	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4260	{
				4261	if (!PyArg_NoArgs(args))
				4262	return NULL;
				4263	return fixup(self, fixswapcase);
				4264	}
				4265
				4266	static char translate__doc__[] =
				4267	"S.translate(table) -> unicode\n\
				4268	\n\
				4269	Return a copy of the string S, where all characters have been mapped\n\
				4270	through the given translation table, which must be a mapping of\n\
				4271	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4272	are left untouched. Characters mapped to None are deleted.";
				4273
				4274	static PyObject*
				4275	unicode_translate(PyUnicodeObject self, PyObject args)
				4276	{
				4277	PyObject *table;
				4278
				4279	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4280	return NULL;
				4281	return PyUnicode_TranslateCharmap(self->str,
				4282	self->length,
				4283	table,
				4284	"ignore");
				4285	}
				4286
				4287	static char upper__doc__[] =
				4288	"S.upper() -> unicode\n\
				4289	\n\
				4290	Return a copy of S converted to uppercase.";
				4291
				4292	static PyObject*
				4293	unicode_upper(PyUnicodeObject self, PyObject args)
				4294	{
				4295	if (!PyArg_NoArgs(args))
				4296	return NULL;
				4297	return fixup(self, fixupper);
				4298	}
				4299
				4300	#if 0
				4301	static char zfill__doc__[] =
				4302	"S.zfill(width) -> unicode\n\
				4303	\n\
				4304	Pad a numeric string x with zeros on the left, to fill a field\n\
				4305	of the specified width. The string x is never truncated.";
				4306
				4307	static PyObject *
				4308	unicode_zfill(PyUnicodeObject self, PyObject args)
				4309	{
				4310	int fill;
				4311	PyUnicodeObject *u;
				4312
				4313	int width;
				4314	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4315	return NULL;
				4316
				4317	if (self->length >= width) {
				4318	Py_INCREF(self);
				4319	return (PyObject*) self;
				4320	}
				4321
				4322	fill = width - self->length;
				4323
				4324	u = pad(self, fill, 0, '0');
				4325
				4326	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4327	/* move sign to beginning of string */
				4328	u->str[0] = u->str[fill];
				4329	u->str[fill] = '0';
				4330	}
				4331
				4332	return (PyObject*) u;
				4333	}
				4334	#endif
				4335
				4336	#if 0
				4337	static PyObject*
				4338	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4339	{
				4340	if (!PyArg_NoArgs(args))
				4341	return NULL;
				4342	return PyInt_FromLong(unicode_freelist_size);
				4343	}
				4344	#endif
				4345
				4346	static char startswith__doc__[] =
				4347	"S.startswith(prefix[, start[, end]]) -> int\n\
				4348	\n\
				4349	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4350	optional start, test S beginning at that position. With optional end, stop\n\
				4351	comparing S at that position.";
				4352
				4353	static PyObject *
				4354	unicode_startswith(PyUnicodeObject *self,
				4355	PyObject *args)
				4356	{
				4357	PyUnicodeObject *substring;
				4358	int start = 0;
				4359	int end = INT_MAX;
				4360	PyObject *result;
				4361
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4362	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4363	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4364	return NULL;
				4365	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4366	(PyObject *)substring);
				4367	if (substring == NULL)
				4368	return NULL;
				4369
				4370	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4371
				4372	Py_DECREF(substring);
				4373	return result;
				4374	}
				4375
				4376
				4377	static char endswith__doc__[] =
				4378	"S.endswith(suffix[, start[, end]]) -> int\n\
				4379	\n\
				4380	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4381	optional start, test S beginning at that position. With optional end, stop\n\
				4382	comparing S at that position.";
				4383
				4384	static PyObject *
				4385	unicode_endswith(PyUnicodeObject *self,
				4386	PyObject *args)
				4387	{
				4388	PyUnicodeObject *substring;
				4389	int start = 0;
				4390	int end = INT_MAX;
				4391	PyObject *result;
				4392
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4393	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4394	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4395	return NULL;
				4396	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4397	(PyObject *)substring);
				4398	if (substring == NULL)
				4399	return NULL;
				4400
				4401	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4402
				4403	Py_DECREF(substring);
				4404	return result;
				4405	}
				4406
				4407
				4408	static PyMethodDef unicode_methods[] = {
				4409
				4410	/* Order is according to common usage: often used methods should
				4411	appear first, since lookup is done sequentially. */
				4412
				4413	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4414	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4415	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4416	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4417	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4418	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4419	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4420	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4421	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4422	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4423	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4424	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4425	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4426	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4427	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4428	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4429	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4430	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4431	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4432	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4433	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4434	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4435	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4436	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4437	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4438	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4439	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4440	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4441	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4442	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4443	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4444	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4445	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4446	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4447	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4448	#if 0
				4449	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4450	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4451	#endif
				4452
				4453	#if 0
				4454	/* This one is just used for debugging the implementation. */
				4455	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4456	#endif
				4457
				4458	{NULL, NULL}
				4459	};
				4460
				4461	static PyObject *
				4462	unicode_getattr(PyUnicodeObject self, char name)
				4463	{
				4464	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4465	}
				4466
				4467	static PySequenceMethods unicode_as_sequence = {
				4468	(inquiry) unicode_length, /* sq_length */
				4469	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4470	(intargfunc) unicode_repeat, /* sq_repeat */
				4471	(intargfunc) unicode_getitem, /* sq_item */
				4472	(intintargfunc) unicode_slice, /* sq_slice */
				4473	0, /* sq_ass_item */
				4474	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4475	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4476	};
				4477
				4478	static int
				4479	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4480	int index,
				4481	const void **ptr)
				4482	{
				4483	if (index != 0) {
				4484	PyErr_SetString(PyExc_SystemError,
				4485	"accessing non-existent unicode segment");
				4486	return -1;
				4487	}
				4488	ptr = (void ) self->str;
				4489	return PyUnicode_GET_DATA_SIZE(self);
				4490	}
				4491
				4492	static int
				4493	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4494	const void **ptr)
				4495	{
				4496	PyErr_SetString(PyExc_TypeError,
				4497	"cannot use unicode as modifyable buffer");
				4498	return -1;
				4499	}
				4500
				4501	static int
				4502	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4503	int *lenp)
				4504	{
				4505	if (lenp)
				4506	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4507	return 1;
				4508	}
				4509
				4510	static int
				4511	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4512	int index,
				4513	const void **ptr)
				4514	{
				4515	PyObject *str;
				4516
				4517	if (index != 0) {
				4518	PyErr_SetString(PyExc_SystemError,
				4519	"accessing non-existent unicode segment");
				4520	return -1;
				4521	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame^]	4522	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4523	if (str == NULL)
				4524	return -1;
				4525	ptr = (void ) PyString_AS_STRING(str);
				4526	return PyString_GET_SIZE(str);
				4527	}
				4528
				4529	/* Helpers for PyUnicode_Format() */
				4530
				4531	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4532	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4533	{
				4534	int argidx = *p_argidx;
				4535	if (argidx < arglen) {
				4536	(*p_argidx)++;
				4537	if (arglen < 0)
				4538	return args;
				4539	else
				4540	return PyTuple_GetItem(args, argidx);
				4541	}
				4542	PyErr_SetString(PyExc_TypeError,
				4543	"not enough arguments for format string");
				4544	return NULL;
				4545	}
				4546
				4547	#define F_LJUST (1<<0)
				4548	#define F_SIGN (1<<1)
				4549	#define F_BLANK (1<<2)
				4550	#define F_ALT (1<<3)
				4551	#define F_ZERO (1<<4)
				4552
				4553	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4554	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4555	{
				4556	register int i;
				4557	int len;
				4558	va_list va;
				4559	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4560	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4561
				4562	/* First, format the string as char array, then expand to Py_UNICODE
				4563	array. */
				4564	charbuffer = (char *)buffer;
				4565	len = vsprintf(charbuffer, format, va);
				4566	for (i = len - 1; i >= 0; i--)
				4567	buffer[i] = (Py_UNICODE) charbuffer[i];
				4568
				4569	va_end(va);
				4570	return len;
				4571	}
				4572
				4573	static int
				4574	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4575	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4576	int flags,
				4577	int prec,
				4578	int type,
				4579	PyObject *v)
				4580	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4581	/* fmt = '%#.' + `prec` + `type`
				4582	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4583	char fmt[20];
				4584	double x;
				4585
				4586	x = PyFloat_AsDouble(v);
				4587	if (x == -1.0 && PyErr_Occurred())
				4588	return -1;
				4589	if (prec < 0)
				4590	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4591	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4592	type = 'g';
				4593	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4594	/* worst case length calc to ensure no buffer overrun:
				4595	fmt = %#.<prec>g
				4596	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4597	for any double rep.)
				4598	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4599	If prec=0 the effective precision is 1 (the leading digit is
				4600	always given), therefore increase by one to 10+prec. */
				4601	if (buflen <= (size_t)10 + (size_t)prec) {
				4602	PyErr_SetString(PyExc_OverflowError,
				4603	"formatted float is too long (precision too long?)");
				4604	return -1;
				4605	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4606	return usprintf(buf, fmt, x);
				4607	}
				4608
				4609	static int
				4610	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4611	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4612	int flags,
				4613	int prec,
				4614	int type,
				4615	PyObject *v)
				4616	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4617	/* fmt = '%#.' + `prec` + 'l' + `type`
				4618	worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4619	char fmt[20];
				4620	long x;
				4621
				4622	x = PyInt_AsLong(v);
				4623	if (x == -1 && PyErr_Occurred())
				4624	return -1;
				4625	if (prec < 0)
				4626	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4627	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4628	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4629	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4630	PyErr_SetString(PyExc_OverflowError,
				4631	"formatted integer is too long (precision too long?)");
				4632	return -1;
				4633	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4634	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4635	return usprintf(buf, fmt, x);
				4636	}
				4637
				4638	static int
				4639	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4640	size_t buflen,
				4641	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4642	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4643	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4644	if (PyUnicode_Check(v)) {
				4645	if (PyUnicode_GET_SIZE(v) != 1)
				4646	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4647	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4648	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4649
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4650	else if (PyString_Check(v)) {
				4651	if (PyString_GET_SIZE(v) != 1)
				4652	goto onError;
				4653	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4654	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4655
				4656	else {
				4657	/* Integer input truncated to a character */
				4658	long x;
				4659	x = PyInt_AsLong(v);
				4660	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4661	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4662	buf[0] = (char) x;
				4663	}
				4664	buf[1] = '\0';
				4665	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4666
				4667	onError:
				4668	PyErr_SetString(PyExc_TypeError,
				4669	"%c requires int or char");
				4670	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4671	}
				4672
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
				4682
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4683	PyObject PyUnicode_Format(PyObject format,
				4684	PyObject *args)
				4685	{
				4686	Py_UNICODE fmt, res;
				4687	int fmtcnt, rescnt, reslen, arglen, argidx;
				4688	int args_owned = 0;
				4689	PyUnicodeObject *result = NULL;
				4690	PyObject *dict = NULL;
				4691	PyObject *uformat;
				4692
				4693	if (format == NULL \|\| args == NULL) {
				4694	PyErr_BadInternalCall();
				4695	return NULL;
				4696	}
				4697	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4698	if (uformat == NULL)
				4699	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4700	fmt = PyUnicode_AS_UNICODE(uformat);
				4701	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4702
				4703	reslen = rescnt = fmtcnt + 100;
				4704	result = _PyUnicode_New(reslen);
				4705	if (result == NULL)
				4706	goto onError;
				4707	res = PyUnicode_AS_UNICODE(result);
				4708
				4709	if (PyTuple_Check(args)) {
				4710	arglen = PyTuple_Size(args);
				4711	argidx = 0;
				4712	}
				4713	else {
				4714	arglen = -1;
				4715	argidx = -2;
				4716	}
				4717	if (args->ob_type->tp_as_mapping)
				4718	dict = args;
				4719
				4720	while (--fmtcnt >= 0) {
				4721	if (*fmt != '%') {
				4722	if (--rescnt < 0) {
				4723	rescnt = fmtcnt + 100;
				4724	reslen += rescnt;
				4725	if (_PyUnicode_Resize(result, reslen) < 0)
				4726	return NULL;
				4727	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4728	--rescnt;
				4729	}
				4730	res++ = fmt++;
				4731	}
				4732	else {
				4733	/* Got a format specifier */
				4734	int flags = 0;
				4735	int width = -1;
				4736	int prec = -1;
				4737	int size = 0;
				4738	Py_UNICODE c = '\0';
				4739	Py_UNICODE fill;
				4740	PyObject *v = NULL;
				4741	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4742	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4743	Py_UNICODE sign;
				4744	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4745	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4746
				4747	fmt++;
				4748	if (*fmt == '(') {
				4749	Py_UNICODE *keystart;
				4750	int keylen;
				4751	PyObject *key;
				4752	int pcount = 1;
				4753
				4754	if (dict == NULL) {
				4755	PyErr_SetString(PyExc_TypeError,
				4756	"format requires a mapping");
				4757	goto onError;
				4758	}
				4759	++fmt;
				4760	--fmtcnt;
				4761	keystart = fmt;
				4762	/* Skip over balanced parentheses */
				4763	while (pcount > 0 && --fmtcnt >= 0) {
				4764	if (*fmt == ')')
				4765	--pcount;
				4766	else if (*fmt == '(')
				4767	++pcount;
				4768	fmt++;
				4769	}
				4770	keylen = fmt - keystart - 1;
				4771	if (fmtcnt < 0 \|\| pcount > 0) {
				4772	PyErr_SetString(PyExc_ValueError,
				4773	"incomplete format key");
				4774	goto onError;
				4775	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4776	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4777	then looked up since Python uses strings to hold
				4778	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4779	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4780	key = PyUnicode_EncodeUTF8(keystart,
				4781	keylen,
				4782	NULL);
				4783	if (key == NULL)
				4784	goto onError;
				4785	if (args_owned) {
				4786	Py_DECREF(args);
				4787	args_owned = 0;
				4788	}
				4789	args = PyObject_GetItem(dict, key);
				4790	Py_DECREF(key);
				4791	if (args == NULL) {
				4792	goto onError;
				4793	}
				4794	args_owned = 1;
				4795	arglen = -1;
				4796	argidx = -2;
				4797	}
				4798	while (--fmtcnt >= 0) {
				4799	switch (c = *fmt++) {
				4800	case '-': flags \|= F_LJUST; continue;
				4801	case '+': flags \|= F_SIGN; continue;
				4802	case ' ': flags \|= F_BLANK; continue;
				4803	case '#': flags \|= F_ALT; continue;
				4804	case '0': flags \|= F_ZERO; continue;
				4805	}
				4806	break;
				4807	}
				4808	if (c == '*') {
				4809	v = getnextarg(args, arglen, &argidx);
				4810	if (v == NULL)
				4811	goto onError;
				4812	if (!PyInt_Check(v)) {
				4813	PyErr_SetString(PyExc_TypeError,
				4814	"* wants int");
				4815	goto onError;
				4816	}
				4817	width = PyInt_AsLong(v);
				4818	if (width < 0) {
				4819	flags \|= F_LJUST;
				4820	width = -width;
				4821	}
				4822	if (--fmtcnt >= 0)
				4823	c = *fmt++;
				4824	}
				4825	else if (c >= '0' && c <= '9') {
				4826	width = c - '0';
				4827	while (--fmtcnt >= 0) {
				4828	c = *fmt++;
				4829	if (c < '0' \|\| c > '9')
				4830	break;
				4831	if ((width*10) / 10 != width) {
				4832	PyErr_SetString(PyExc_ValueError,
				4833	"width too big");
				4834	goto onError;
				4835	}
				4836	width = width*10 + (c - '0');
				4837	}
				4838	}
				4839	if (c == '.') {
				4840	prec = 0;
				4841	if (--fmtcnt >= 0)
				4842	c = *fmt++;
				4843	if (c == '*') {
				4844	v = getnextarg(args, arglen, &argidx);
				4845	if (v == NULL)
				4846	goto onError;
				4847	if (!PyInt_Check(v)) {
				4848	PyErr_SetString(PyExc_TypeError,
				4849	"* wants int");
				4850	goto onError;
				4851	}
				4852	prec = PyInt_AsLong(v);
				4853	if (prec < 0)
				4854	prec = 0;
				4855	if (--fmtcnt >= 0)
				4856	c = *fmt++;
				4857	}
				4858	else if (c >= '0' && c <= '9') {
				4859	prec = c - '0';
				4860	while (--fmtcnt >= 0) {
				4861	c = Py_CHARMASK(*fmt++);
				4862	if (c < '0' \|\| c > '9')
				4863	break;
				4864	if ((prec*10) / 10 != prec) {
				4865	PyErr_SetString(PyExc_ValueError,
				4866	"prec too big");
				4867	goto onError;
				4868	}
				4869	prec = prec*10 + (c - '0');
				4870	}
				4871	}
				4872	} /* prec */
				4873	if (fmtcnt >= 0) {
				4874	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4875	size = c;
				4876	if (--fmtcnt >= 0)
				4877	c = *fmt++;
				4878	}
				4879	}
				4880	if (fmtcnt < 0) {
				4881	PyErr_SetString(PyExc_ValueError,
				4882	"incomplete format");
				4883	goto onError;
				4884	}
				4885	if (c != '%') {
				4886	v = getnextarg(args, arglen, &argidx);
				4887	if (v == NULL)
				4888	goto onError;
				4889	}
				4890	sign = 0;
				4891	fill = ' ';
				4892	switch (c) {
				4893
				4894	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4895	pbuf = formatbuf;
				4896	/* presume that buffer length is at least 1 */
				4897	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4898	len = 1;
				4899	break;
				4900
				4901	case 's':
				4902	case 'r':
				4903	if (PyUnicode_Check(v) && c == 's') {
				4904	temp = v;
				4905	Py_INCREF(temp);
				4906	}
				4907	else {
				4908	PyObject *unicode;
				4909	if (c == 's')
				4910	temp = PyObject_Str(v);
				4911	else
				4912	temp = PyObject_Repr(v);
				4913	if (temp == NULL)
				4914	goto onError;
				4915	if (!PyString_Check(temp)) {
				4916	/* XXX Note: this should never happen, since
				4917	PyObject_Repr() and PyObject_Str() assure
				4918	this */
				4919	Py_DECREF(temp);
				4920	PyErr_SetString(PyExc_TypeError,
				4921	"%s argument has non-string str()");
				4922	goto onError;
				4923	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4924	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4925	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4926	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4927	"strict");
				4928	Py_DECREF(temp);
				4929	temp = unicode;
				4930	if (temp == NULL)
				4931	goto onError;
				4932	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4933	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4934	len = PyUnicode_GET_SIZE(temp);
				4935	if (prec >= 0 && len > prec)
				4936	len = prec;
				4937	break;
				4938
				4939	case 'i':
				4940	case 'd':
				4941	case 'u':
				4942	case 'o':
				4943	case 'x':
				4944	case 'X':
				4945	if (c == 'i')
				4946	c = 'd';
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4947	pbuf = formatbuf;
				4948	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4949	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4950	if (len < 0)
				4951	goto onError;
				4952	sign = (c == 'd');
				4953	if (flags & F_ZERO) {
				4954	fill = '0';
				4955	if ((flags&F_ALT) &&
				4956	(c == 'x' \|\| c == 'X') &&
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4957	pbuf[0] == '0' && pbuf[1] == c) {
				4958	res++ = pbuf++;
				4959	res++ = pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4960	rescnt -= 2;
				4961	len -= 2;
				4962	width -= 2;
				4963	if (width < 0)
				4964	width = 0;
				4965	}
				4966	}
				4967	break;
				4968
				4969	case 'e':
				4970	case 'E':
				4971	case 'f':
				4972	case 'g':
				4973	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4974	pbuf = formatbuf;
				4975	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4976	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4977	if (len < 0)
				4978	goto onError;
				4979	sign = 1;
				4980	if (flags&F_ZERO)
				4981	fill = '0';
				4982	break;
				4983
				4984	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4985	pbuf = formatbuf;
				4986	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4987	if (len < 0)
				4988	goto onError;
				4989	break;
				4990
				4991	default:
				4992	PyErr_Format(PyExc_ValueError,
				4993	"unsupported format character '%c' (0x%x)",
				4994	c, c);
				4995	goto onError;
				4996	}
				4997	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4998	if (pbuf == '-' \|\| pbuf == '+') {
				4999	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5000	len--;
				5001	}
				5002	else if (flags & F_SIGN)
				5003	sign = '+';
				5004	else if (flags & F_BLANK)
				5005	sign = ' ';
				5006	else
				5007	sign = 0;
				5008	}
				5009	if (width < len)
				5010	width = len;
				5011	if (rescnt < width + (sign != 0)) {
				5012	reslen -= rescnt;
				5013	rescnt = width + fmtcnt + 100;
				5014	reslen += rescnt;
				5015	if (_PyUnicode_Resize(result, reslen) < 0)
				5016	return NULL;
				5017	res = PyUnicode_AS_UNICODE(result)
				5018	+ reslen - rescnt;
				5019	}
				5020	if (sign) {
				5021	if (fill != ' ')
				5022	*res++ = sign;
				5023	rescnt--;
				5024	if (width > len)
				5025	width--;
				5026	}
				5027	if (width > len && !(flags & F_LJUST)) {
				5028	do {
				5029	--rescnt;
				5030	*res++ = fill;
				5031	} while (--width > len);
				5032	}
				5033	if (sign && fill == ' ')
				5034	*res++ = sign;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5035	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5036	res += len;
				5037	rescnt -= len;
				5038	while (--width >= len) {
				5039	--rescnt;
				5040	*res++ = ' ';
				5041	}
				5042	if (dict && (argidx < arglen) && c != '%') {
				5043	PyErr_SetString(PyExc_TypeError,
				5044	"not all arguments converted");
				5045	goto onError;
				5046	}
				5047	Py_XDECREF(temp);
				5048	} /* '%' */
				5049	} /* until end */
				5050	if (argidx < arglen && !dict) {
				5051	PyErr_SetString(PyExc_TypeError,
				5052	"not all arguments converted");
				5053	goto onError;
				5054	}
				5055
				5056	if (args_owned) {
				5057	Py_DECREF(args);
				5058	}
				5059	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5060	if (_PyUnicode_Resize(result, reslen - rescnt))
				5061	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5062	return (PyObject *)result;
				5063
				5064	onError:
				5065	Py_XDECREF(result);
				5066	Py_DECREF(uformat);
				5067	if (args_owned) {
				5068	Py_DECREF(args);
				5069	}
				5070	return NULL;
				5071	}
				5072
/* Buffer-procedure table for Unicode objects; PyUnicode_Type refers to
   it through its tp_as_buffer slot.  The slot roles noted below are
   read off the cast type names -- confirm against the PyBufferProcs
   declaration. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,      /* read buffer */
    (getwritebufferproc) unicode_buffer_getwritebuf,    /* write buffer */
    (getsegcountproc) unicode_buffer_getsegcount,       /* segment count */
    (getcharbufferproc) unicode_buffer_getcharbuf,      /* char buffer */
};
				5079
/* Type object for Unicode objects.  Slots initialized to 0 or NULL are
   not provided by this type; the sequence and buffer behavior comes
   from the unicode_as_sequence and unicode_as_buffer tables. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
				5104
				5105	/* Initialize the Unicode implementation */
				5106
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5107	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5108	{
				5109	/* Doublecheck the configuration... */
				5110	if (sizeof(Py_UNICODE) != 2)
				5111	Py_FatalError("Unicode configuration error: "
				5112	"sizeof(Py_UNICODE) != 2 bytes");
				5113
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5114	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5115	unicode_freelist = NULL;
				5116	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5117	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5118	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5119	}
				5120
				5121	/* Finalize the Unicode implementation */
				5122
				5123	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5124	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5125	{
				5126	PyUnicodeObject *u = unicode_freelist;
				5127
				5128	while (u != NULL) {
				5129	PyUnicodeObject *v = u;
				5130	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5131	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5132	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame^]	5133	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5134	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5135	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5136	unicode_freelist = NULL;
				5137	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5138	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5139	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5140	}