Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 76bb92a117f041fde0169aea032fa07f680e2bfc [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	111	/* --- Globals ------------------------------------------------------------
				112
				113	The globals are initialized by the _PyUnicode_Init() API and should
				114	not be used before calling that API.
				115
				116	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	117
				118	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	119	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	120
				121	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	122	static PyUnicodeObject *unicode_freelist;
				123	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	124
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	125	/* Default encoding to use and assume when NULL is passed as encoding
				126	parameter; it is initialized by _PyUnicode_Init().
				127
				128	Always use the PyUnicode_SetDefaultEncoding() and
				129	PyUnicode_GetDefaultEncoding() APIs to access this global.
				130
				131	*/
				132
				133	static char unicode_default_encoding[100];
				134
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* --- Unicode Object ----------------------------------------------------- */
				136
				137	static
				138	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				139	int length)
				140	{
				141	void *oldstr;
				142
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	143	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	144	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	146
				147	/* Resizing unicode_empty is not allowed. */
				148	if (unicode == unicode_empty) {
				149	PyErr_SetString(PyExc_SystemError,
				150	"can't resize empty unicode object");
				151	return -1;
				152	}
				153
				154	/* We allocate one more byte to make sure the string is
				155	Ux0000 terminated -- XXX is this needed ? */
				156	oldstr = unicode->str;
				157	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				158	if (!unicode->str) {
				159	unicode->str = oldstr;
				160	PyErr_NoMemory();
				161	return -1;
				162	}
				163	unicode->str[length] = 0;
				164	unicode->length = length;
				165
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	166	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	167	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	168	if (unicode->defenc) {
				169	Py_DECREF(unicode->defenc);
				170	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	171	}
				172	unicode->hash = -1;
				173
				174	return 0;
				175	}
				176
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	177	int PyUnicode_Resize(PyObject **unicode,
				178	int length)
				179	{
				180	PyUnicodeObject *v;
				181
				182	if (unicode == NULL) {
				183	PyErr_BadInternalCall();
				184	return -1;
				185	}
				186	v = (PyUnicodeObject )unicode;
				187	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				188	PyErr_BadInternalCall();
				189	return -1;
				190	}
				191	return _PyUnicode_Resize(v, length);
				192	}
				193
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	194	/* We allocate one more byte to make sure the string is
				195	Ux0000 terminated -- XXX is this needed ?
				196
				197	XXX This allocator could further be enhanced by assuring that the
				198	free list never reduces its size below 1.
				199
				200	*/
				201
				202	static
				203	PyUnicodeObject *_PyUnicode_New(int length)
				204	{
				205	register PyUnicodeObject *unicode;
				206
				207	/* Optimization for empty strings */
				208	if (length == 0 && unicode_empty != NULL) {
				209	Py_INCREF(unicode_empty);
				210	return unicode_empty;
				211	}
				212
				213	/* Unicode freelist & memory allocation */
				214	if (unicode_freelist) {
				215	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	216	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	217	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	219	/* Keep-Alive optimization: we only upsize the buffer,
				220	never downsize it. */
				221	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	223	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	224	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	227	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	229	}
				230	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	231	}
				232	else {
				233	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				234	if (unicode == NULL)
				235	return NULL;
				236	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				237	}
				238
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	239	if (!unicode->str) {
				240	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	241	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	242	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	243	unicode->str[length] = 0;
				244	unicode->length = length;
				245	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	246	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	248
				249	onError:
				250	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	251	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	252	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	253	}
				254
				255	static
				256	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				257	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	258	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	259	/* Keep-Alive optimization */
				260	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	261	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	262	unicode->str = NULL;
				263	unicode->length = 0;
				264	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	265	if (unicode->defenc) {
				266	Py_DECREF(unicode->defenc);
				267	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	268	}
				269	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	270	(PyUnicodeObject *)unicode = unicode_freelist;
				271	unicode_freelist = unicode;
				272	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	273	}
				274	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	275	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	276	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	277	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	278	}
				279	}
				280
				281	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				282	int size)
				283	{
				284	PyUnicodeObject *unicode;
				285
				286	unicode = _PyUnicode_New(size);
				287	if (!unicode)
				288	return NULL;
				289
				290	/* Copy the Unicode data into the new object */
				291	if (u != NULL)
				292	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	#ifdef HAVE_WCHAR_H
				298
				299	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				300	int size)
				301	{
				302	PyUnicodeObject *unicode;
				303
				304	if (w == NULL) {
				305	PyErr_BadInternalCall();
				306	return NULL;
				307	}
				308
				309	unicode = _PyUnicode_New(size);
				310	if (!unicode)
				311	return NULL;
				312
				313	/* Copy the wchar_t data into the new object */
				314	#ifdef HAVE_USABLE_WCHAR_T
				315	memcpy(unicode->str, w, size * sizeof(wchar_t));
				316	#else
				317	{
				318	register Py_UNICODE *u;
				319	register int i;
				320	u = PyUnicode_AS_UNICODE(unicode);
				321	for (i = size; i >= 0; i--)
				322	u++ = w++;
				323	}
				324	#endif
				325
				326	return (PyObject *)unicode;
				327	}
				328
				329	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				330	register wchar_t *w,
				331	int size)
				332	{
				333	if (unicode == NULL) {
				334	PyErr_BadInternalCall();
				335	return -1;
				336	}
				337	if (size > PyUnicode_GET_SIZE(unicode))
				338	size = PyUnicode_GET_SIZE(unicode);
				339	#ifdef HAVE_USABLE_WCHAR_T
				340	memcpy(w, unicode->str, size * sizeof(wchar_t));
				341	#else
				342	{
				343	register Py_UNICODE *u;
				344	register int i;
				345	u = PyUnicode_AS_UNICODE(unicode);
				346	for (i = size; i >= 0; i--)
				347	w++ = u++;
				348	}
				349	#endif
				350
				351	return size;
				352	}
				353
				354	#endif
				355
				356	PyObject PyUnicode_FromObject(register PyObject obj)
				357	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	358	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				359	}
				360
				361	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				362	const char *encoding,
				363	const char *errors)
				364	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	365	const char *s;
				366	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	367	int owned = 0;
				368	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	369
				370	if (obj == NULL) {
				371	PyErr_BadInternalCall();
				372	return NULL;
				373	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	374
				375	/* Coerce object */
				376	if (PyInstance_Check(obj)) {
				377	PyObject *func;
				378	func = PyObject_GetAttrString(obj, "__str__");
				379	if (func == NULL) {
				380	PyErr_SetString(PyExc_TypeError,
				381	"coercing to Unicode: instance doesn't define __str__");
				382	return NULL;
				383	}
				384	obj = PyEval_CallObject(func, NULL);
				385	Py_DECREF(func);
				386	if (obj == NULL)
				387	return NULL;
				388	owned = 1;
				389	}
				390	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	391	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	392	v = obj;
				393	if (encoding) {
				394	PyErr_SetString(PyExc_TypeError,
				395	"decoding Unicode is not supported");
				396	return NULL;
				397	}
				398	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	399	}
				400	else if (PyString_Check(obj)) {
				401	s = PyString_AS_STRING(obj);
				402	len = PyString_GET_SIZE(obj);
				403	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	404	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				405	/* Overwrite the error message with something more useful in
				406	case of a TypeError. */
				407	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	408	PyErr_Format(PyExc_TypeError,
				409	"coercing to Unicode: need string or buffer, "
				410	"%.80s found",
				411	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414
				415	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	if (len == 0) {
				417	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	418	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	419	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	420	else
				421	v = PyUnicode_Decode(s, len, encoding, errors);
				422	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	423	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	424	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426	return v;
				427
				428	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	429	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	430	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	431	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	432	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	433	}
				434
				435	PyObject PyUnicode_Decode(const char s,
				436	int size,
				437	const char *encoding,
				438	const char *errors)
				439	{
				440	PyObject buffer = NULL, unicode;
				441
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	442	if (encoding == NULL)
				443	encoding = PyUnicode_GetDefaultEncoding();
				444
				445	/* Shortcuts for common default encodings */
				446	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	447	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	448	else if (strcmp(encoding, "latin-1") == 0)
				449	return PyUnicode_DecodeLatin1(s, size, errors);
				450	else if (strcmp(encoding, "ascii") == 0)
				451	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452
				453	/* Decode via the codec registry */
				454	buffer = PyBuffer_FromMemory((void *)s, size);
				455	if (buffer == NULL)
				456	goto onError;
				457	unicode = PyCodec_Decode(buffer, encoding, errors);
				458	if (unicode == NULL)
				459	goto onError;
				460	if (!PyUnicode_Check(unicode)) {
				461	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	462	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	463	unicode->ob_type->tp_name);
				464	Py_DECREF(unicode);
				465	goto onError;
				466	}
				467	Py_DECREF(buffer);
				468	return unicode;
				469
				470	onError:
				471	Py_XDECREF(buffer);
				472	return NULL;
				473	}
				474
				475	PyObject PyUnicode_Encode(const Py_UNICODE s,
				476	int size,
				477	const char *encoding,
				478	const char *errors)
				479	{
				480	PyObject v, unicode;
				481
				482	unicode = PyUnicode_FromUnicode(s, size);
				483	if (unicode == NULL)
				484	return NULL;
				485	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				486	Py_DECREF(unicode);
				487	return v;
				488	}
				489
				490	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				491	const char *encoding,
				492	const char *errors)
				493	{
				494	PyObject *v;
				495
				496	if (!PyUnicode_Check(unicode)) {
				497	PyErr_BadArgument();
				498	goto onError;
				499	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	500
				501	if (encoding == NULL)
				502	encoding = PyUnicode_GetDefaultEncoding();
				503
				504	/* Shortcuts for common default encodings */
				505	if (errors == NULL) {
				506	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	507	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	508	else if (strcmp(encoding, "latin-1") == 0)
				509	return PyUnicode_AsLatin1String(unicode);
				510	else if (strcmp(encoding, "ascii") == 0)
				511	return PyUnicode_AsASCIIString(unicode);
				512	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	513
				514	/* Encode via the codec registry */
				515	v = PyCodec_Encode(unicode, encoding, errors);
				516	if (v == NULL)
				517	goto onError;
				518	/* XXX Should we really enforce this ? */
				519	if (!PyString_Check(v)) {
				520	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	521	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	522	v->ob_type->tp_name);
				523	Py_DECREF(v);
				524	goto onError;
				525	}
				526	return v;
				527
				528	onError:
				529	return NULL;
				530	}
				531
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	532	/* Return a Python string holding the default encoded value of the
				533	Unicode object.
				534
				535	The resulting string is cached in the Unicode object for subsequent
				536	usage by this function. The cached version is needed to implement
				537	the character buffer interface and will live (at least) as long as
				538	the Unicode object itself.
				539
				540	The refcount of the string is not incremented.
				541
				542	* Exported for internal use by the interpreter only !!! *
				543
				544	*/
				545
				546	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				547	const char *errors)
				548	{
				549	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				550
				551	if (v)
				552	return v;
				553	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				554	if (v && errors == NULL)
				555	((PyUnicodeObject *)unicode)->defenc = v;
				556	return v;
				557	}
				558
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				560	{
				561	if (!PyUnicode_Check(unicode)) {
				562	PyErr_BadArgument();
				563	goto onError;
				564	}
				565	return PyUnicode_AS_UNICODE(unicode);
				566
				567	onError:
				568	return NULL;
				569	}
				570
				571	int PyUnicode_GetSize(PyObject *unicode)
				572	{
				573	if (!PyUnicode_Check(unicode)) {
				574	PyErr_BadArgument();
				575	goto onError;
				576	}
				577	return PyUnicode_GET_SIZE(unicode);
				578
				579	onError:
				580	return -1;
				581	}
				582
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	583	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	584	{
				585	return unicode_default_encoding;
				586	}
				587
				588	int PyUnicode_SetDefaultEncoding(const char *encoding)
				589	{
				590	PyObject *v;
				591
				592	/* Make sure the encoding is valid. As side effect, this also
				593	loads the encoding into the codec registry cache. */
				594	v = _PyCodec_Lookup(encoding);
				595	if (v == NULL)
				596	goto onError;
				597	Py_DECREF(v);
				598	strncpy(unicode_default_encoding,
				599	encoding,
				600	sizeof(unicode_default_encoding));
				601	return 0;
				602
				603	onError:
				604	return -1;
				605	}
				606
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	607	/* --- UTF-8 Codec -------------------------------------------------------- */
				608
				609	static
				610	char utf8_code_length[256] = {
				611	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				612	illegal prefix. see RFC 2279 for details */
				613	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				614	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				615	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				616	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				617	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				618	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				619	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				620	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				621	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				622	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				623	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				624	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				625	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				626	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				627	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				628	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				629	};
				630
				631	static
				632	int utf8_decoding_error(const char **source,
				633	Py_UNICODE **dest,
				634	const char *errors,
				635	const char *details)
				636	{
				637	if ((errors == NULL) \|\|
				638	(strcmp(errors,"strict") == 0)) {
				639	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	640	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	641	details);
				642	return -1;
				643	}
				644	else if (strcmp(errors,"ignore") == 0) {
				645	(*source)++;
				646	return 0;
				647	}
				648	else if (strcmp(errors,"replace") == 0) {
				649	(*source)++;
				650	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				651	(*dest)++;
				652	return 0;
				653	}
				654	else {
				655	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	656	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	657	errors);
				658	return -1;
				659	}
				660	}
				661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	662	PyObject PyUnicode_DecodeUTF8(const char s,
				663	int size,
				664	const char *errors)
				665	{
				666	int n;
				667	const char *e;
				668	PyUnicodeObject *unicode;
				669	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	670	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	671
				672	/* Note: size will always be longer than the resulting Unicode
				673	character count */
				674	unicode = _PyUnicode_New(size);
				675	if (!unicode)
				676	return NULL;
				677	if (size == 0)
				678	return (PyObject *)unicode;
				679
				680	/* Unpack UTF-8 encoded data */
				681	p = unicode->str;
				682	e = s + size;
				683
				684	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	685	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686
				687	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	688	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	689	s++;
				690	continue;
				691	}
				692
				693	n = utf8_code_length[ch];
				694
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	695	if (s + n > e) {
				696	errmsg = "unexpected end of data";
				697	goto utf8Error;
				698	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699
				700	switch (n) {
				701
				702	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	703	errmsg = "unexpected code byte";
				704	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	705	break;
				706
				707	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	708	errmsg = "internal error";
				709	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	710	break;
				711
				712	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	if ((s[1] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	717	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	718	if (ch < 0x80) {
				719	errmsg = "illegal encoding";
				720	goto utf8Error;
				721	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	722	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	723	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	724	break;
				725
				726	case 3:
				727	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	728	(s[2] & 0xc0) != 0x80) {
				729	errmsg = "invalid data";
				730	goto utf8Error;
				731	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	732	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	733	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				734	errmsg = "illegal encoding";
				735	goto utf8Error;
				736	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	738	*p++ = (Py_UNICODE)ch;
				739	break;
				740
				741	case 4:
				742	if ((s[1] & 0xc0) != 0x80 \|\|
				743	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	(s[3] & 0xc0) != 0x80) {
				745	errmsg = "invalid data";
				746	goto utf8Error;
				747	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	748	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				749	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				750	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	751	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				752	byte encoding */
				753	(ch > 0x10ffff)) { /* maximum value allowed for
				754	UTF-16 */
				755	errmsg = "illegal encoding";
				756	goto utf8Error;
				757	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	758	/* compute and append the two surrogates: */
				759
				760	/* translate from 10000..10FFFF to 0..FFFF */
				761	ch -= 0x10000;
				762
				763	/* high surrogate = top 10 bits added to D800 */
				764	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				765
				766	/* low surrogate = bottom 10 bits added to DC00 */
				767	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	768	break;
				769
				770	default:
				771	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	772	errmsg = "unsupported Unicode code range";
				773	goto utf8Error;
				774	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	775	}
				776	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	777	continue;
				778
				779	utf8Error:
				780	if (utf8_decoding_error(&s, &p, errors, errmsg))
				781	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	782	}
				783
				784	/* Adjust length */
				785	if (_PyUnicode_Resize(unicode, p - unicode->str))
				786	goto onError;
				787
				788	return (PyObject *)unicode;
				789
				790	onError:
				791	Py_DECREF(unicode);
				792	return NULL;
				793	}
				794
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding failure.

   errors == NULL or "strict": raise UnicodeError carrying `details`
                               and return -1.
   "ignore":                   return 0 without writing anything.
   "replace":                  write '?' at **dest, advance *dest,
                               return 0.
   anything else:              raise ValueError and return -1.

   `source` is not used by this helper. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if ((errors == NULL) ||
        (strcmp(errors,"strict") == 0)) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    else if (strcmp(errors,"ignore") == 0) {
        return 0;
    }
    else if (strcmp(errors,"replace") == 0) {
        **dest = '?';
        (*dest)++;
        return 0;
    }
    else {
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	828
				829	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				830	int size,
				831	const char *errors)
				832	{
				833	PyObject *v;
				834	char *p;
				835	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	836	Py_UCS4 ch2;
				837	unsigned int cbAllocated = 3 * size;
				838	unsigned int cbWritten = 0;
				839	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	842	if (v == NULL)
				843	return NULL;
				844	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	845	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	846
				847	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	while (i < size) {
				849	Py_UCS4 ch = s[i++];
				850	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	851	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	852	cbWritten++;
				853	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	854	else if (ch < 0x0800) {
				855	*p++ = 0xc0 \| (ch >> 6);
				856	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	857	cbWritten += 2;
				858	}
				859	else {
				860	/* Check for high surrogate */
				861	if (0xD800 <= ch && ch <= 0xDBFF) {
				862	if (i != size) {
				863	ch2 = s[i];
				864	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				865
				866	if (cbWritten >= (cbAllocated - 4)) {
				867	/* Provide enough room for some more
				868	surrogates */
				869	cbAllocated += 4*10;
				870	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	871	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	872	}
				873
				874	/* combine the two values */
				875	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				876
				877	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	878	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	879	i++;
				880	cbWritten += 4;
				881	}
				882	}
				883	}
				884	else {
				885	*p++ = (char)(0xe0 \| (ch >> 12));
				886	cbWritten += 3;
				887	}
				888	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				889	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	}
				891	}
				892	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	893	if (_PyString_Resize(&v, p - q))
				894	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	895	return v;
				896
				897	onError:
				898	Py_DECREF(v);
				899	return NULL;
				900	}
				901
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	902	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				903	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	904	if (!PyUnicode_Check(unicode)) {
				905	PyErr_BadArgument();
				906	return NULL;
				907	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	908	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				909	PyUnicode_GET_SIZE(unicode),
				910	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	911	}
				912
				913	/* --- UTF-16 Codec ------------------------------------------------------- */
				914
				915	static
				916	int utf16_decoding_error(const Py_UNICODE **source,
				917	Py_UNICODE **dest,
				918	const char *errors,
				919	const char *details)
				920	{
				921	if ((errors == NULL) \|\|
				922	(strcmp(errors,"strict") == 0)) {
				923	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	924	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	925	details);
				926	return -1;
				927	}
				928	else if (strcmp(errors,"ignore") == 0) {
				929	return 0;
				930	}
				931	else if (strcmp(errors,"replace") == 0) {
				932	if (dest) {
				933	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				934	(*dest)++;
				935	}
				936	return 0;
				937	}
				938	else {
				939	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	940	"UTF-16 decoding error; "
				941	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	942	errors);
				943	return -1;
				944	}
				945	}
				946
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	947	PyObject PyUnicode_DecodeUTF16(const char s,
				948	int size,
				949	const char *errors,
				950	int *byteorder)
				951	{
				952	PyUnicodeObject *unicode;
				953	Py_UNICODE *p;
				954	const Py_UNICODE q, e;
				955	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	956	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	957
				958	/* size should be an even number */
				959	if (size % sizeof(Py_UNICODE) != 0) {
				960	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				961	return NULL;
				962	/* The remaining input chars are ignored if we fall through
				963	here... */
				964	}
				965
				966	/* Note: size will always be longer than the resulting Unicode
				967	character count */
				968	unicode = _PyUnicode_New(size);
				969	if (!unicode)
				970	return NULL;
				971	if (size == 0)
				972	return (PyObject *)unicode;
				973
				974	/* Unpack UTF-16 encoded data */
				975	p = unicode->str;
				976	q = (Py_UNICODE *)s;
				977	e = q + (size / sizeof(Py_UNICODE));
				978
				979	if (byteorder)
				980	bo = *byteorder;
				981
				982	while (q < e) {
				983	register Py_UNICODE ch = *q++;
				984
				985	/* Check for BOM marks (U+FEFF) in the input and adjust
				986	current byte order setting accordingly. Swap input
				987	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				988	!) */
				989	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				990	if (ch == 0xFEFF) {
				991	bo = -1;
				992	continue;
				993	} else if (ch == 0xFFFE) {
				994	bo = 1;
				995	continue;
				996	}
				997	if (bo == 1)
				998	ch = (ch >> 8) \| (ch << 8);
				999	#else
				1000	if (ch == 0xFEFF) {
				1001	bo = 1;
				1002	continue;
				1003	} else if (ch == 0xFFFE) {
				1004	bo = -1;
				1005	continue;
				1006	}
				1007	if (bo == -1)
				1008	ch = (ch >> 8) \| (ch << 8);
				1009	#endif
				1010	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1011	*p++ = ch;
				1012	continue;
				1013	}
				1014
				1015	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1016	if (q >= e) {
				1017	errmsg = "unexpected end of data";
				1018	goto utf16Error;
				1019	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1020	if (0xDC00 <= q && q <= 0xDFFF) {
				1021	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1022	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023	/* This is valid data (a UTF-16 surrogate pair), but
				1024	we are not able to store this information since our
				1025	Py_UNICODE type only has 16 bits... this might
				1026	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1027	errmsg = "code pairs are not supported";
				1028	goto utf16Error;
				1029	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1030	else
				1031	continue;
				1032	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1033	errmsg = "illegal encoding";
				1034	/* Fall through to report the error */
				1035
				1036	utf16Error:
				1037	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1038	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1039	}
				1040
				1041	if (byteorder)
				1042	*byteorder = bo;
				1043
				1044	/* Adjust length */
				1045	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1046	goto onError;
				1047
				1048	return (PyObject *)unicode;
				1049
				1050	onError:
				1051	Py_DECREF(unicode);
				1052	return NULL;
				1053	}
				1054
				1055	#undef UTF16_ERROR
				1056
				1057	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1058	int size,
				1059	const char *errors,
				1060	int byteorder)
				1061	{
				1062	PyObject *v;
				1063	Py_UNICODE *p;
				1064	char *q;
				1065
				1066	/* We don't create UTF-16 pairs... */
				1067	v = PyString_FromStringAndSize(NULL,
				1068	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1069	if (v == NULL)
				1070	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1071
				1072	q = PyString_AS_STRING(v);
				1073	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1074	if (byteorder == 0)
				1075	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1076	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1077	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1078	if (byteorder == 0 \|\|
				1079	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1080	byteorder == -1
				1081	#else
				1082	byteorder == 1
				1083	#endif
				1084	)
				1085	memcpy(p, s, size * sizeof(Py_UNICODE));
				1086	else
				1087	while (size-- > 0) {
				1088	Py_UNICODE ch = *s++;
				1089	*p++ = (ch >> 8) \| (ch << 8);
				1090	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1091	return v;
				1092	}
				1093
				1094	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1095	{
				1096	if (!PyUnicode_Check(unicode)) {
				1097	PyErr_BadArgument();
				1098	return NULL;
				1099	}
				1100	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1101	PyUnicode_GET_SIZE(unicode),
				1102	NULL,
				1103	0);
				1104	}
				1105
				1106	/* --- Unicode Escape Codec ----------------------------------------------- */
				1107
				1108	static
				1109	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1110	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1111	const char *errors,
				1112	const char *details)
				1113	{
				1114	if ((errors == NULL) \|\|
				1115	(strcmp(errors,"strict") == 0)) {
				1116	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1117	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1118	details);
				1119	return -1;
				1120	}
				1121	else if (strcmp(errors,"ignore") == 0) {
				1122	return 0;
				1123	}
				1124	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1125	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1126	return 0;
				1127	}
				1128	else {
				1129	PyErr_Format(PyExc_ValueError,
				1130	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1131	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1132	errors);
				1133	return -1;
				1134	}
				1135	}
				1136
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1137	static _Py_UCNHashAPI *pucnHash = NULL;
				1138
				1139	static
				1140	int mystrnicmp(const char s1, const char s2, size_t count)
				1141	{
				1142	char c1, c2;
				1143
				1144	if (count)
				1145	{
				1146	do
				1147	{
				1148	c1 = tolower(*(s1++));
				1149	c2 = tolower(*(s2++));
				1150	}
				1151	while(--count && c1 == c2);
				1152
				1153	return c1 - c2;
				1154	}
				1155
				1156	return 0;
				1157	}
				1158
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1160	int size,
				1161	const char *errors)
				1162	{
				1163	PyUnicodeObject *v;
				1164	Py_UNICODE p = NULL, buf = NULL;
				1165	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1166	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1167
				1168	/* Escaped strings will always be longer than the resulting
				1169	Unicode string, so we start with size here and then reduce the
				1170	length after conversion to the true value. */
				1171	v = _PyUnicode_New(size);
				1172	if (v == NULL)
				1173	goto onError;
				1174	if (size == 0)
				1175	return (PyObject *)v;
				1176	p = buf = PyUnicode_AS_UNICODE(v);
				1177	end = s + size;
				1178	while (s < end) {
				1179	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1180	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1181	int i;
				1182
				1183	/* Non-escape characters are interpreted as Unicode ordinals */
				1184	if (*s != '\\') {
				1185	p++ = (unsigned char)s++;
				1186	continue;
				1187	}
				1188
				1189	/* \ - Escapes */
				1190	s++;
				1191	switch (*s++) {
				1192
				1193	/* \x escapes */
				1194	case '\n': break;
				1195	case '\\': *p++ = '\\'; break;
				1196	case '\'': *p++ = '\''; break;
				1197	case '\"': *p++ = '\"'; break;
				1198	case 'b': *p++ = '\b'; break;
				1199	case 'f': p++ = '\014'; break; / FF */
				1200	case 't': *p++ = '\t'; break;
				1201	case 'n': *p++ = '\n'; break;
				1202	case 'r': *p++ = '\r'; break;
				1203	case 'v': p++ = '\013'; break; / VT */
				1204	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1205
				1206	/* \OOO (octal) escapes */
				1207	case '0': case '1': case '2': case '3':
				1208	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1209	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1210	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1211	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1212	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1213	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1214	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1215	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1216	break;
				1217
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1218	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1219	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1220	for (x = 0, i = 0; i < 2; i++) {
				1221	c = (unsigned char)s[i];
				1222	if (!isxdigit(c)) {
				1223	if (unicodeescape_decoding_error(&s, &x, errors,
				1224	"truncated \\xXX"))
				1225	goto onError;
				1226	i++;
				1227	break;
				1228	}
				1229	x = (x<<4) & ~0xF;
				1230	if (c >= '0' && c <= '9')
				1231	x += c - '0';
				1232	else if (c >= 'a' && c <= 'f')
				1233	x += 10 + c - 'a';
				1234	else
				1235	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1236	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1237	s += i;
				1238	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1239	break;
				1240
				1241	/* \uXXXX with 4 hex digits */
				1242	case 'u':
				1243	for (x = 0, i = 0; i < 4; i++) {
				1244	c = (unsigned char)s[i];
				1245	if (!isxdigit(c)) {
				1246	if (unicodeescape_decoding_error(&s, &x, errors,
				1247	"truncated \\uXXXX"))
				1248	goto onError;
				1249	i++;
				1250	break;
				1251	}
				1252	x = (x<<4) & ~0xF;
				1253	if (c >= '0' && c <= '9')
				1254	x += c - '0';
				1255	else if (c >= 'a' && c <= 'f')
				1256	x += 10 + c - 'a';
				1257	else
				1258	x += 10 + c - 'A';
				1259	}
				1260	s += i;
				1261	*p++ = x;
				1262	break;
				1263
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1264	/* \UXXXXXXXX with 8 hex digits */
				1265	case 'U':
				1266	for (chr = 0, i = 0; i < 8; i++) {
				1267	c = (unsigned char)s[i];
				1268	if (!isxdigit(c)) {
				1269	if (unicodeescape_decoding_error(&s, &x, errors,
				1270	"truncated \\uXXXX"))
				1271	goto onError;
				1272	i++;
				1273	break;
				1274	}
				1275	chr = (chr<<4) & ~0xF;
				1276	if (c >= '0' && c <= '9')
				1277	chr += c - '0';
				1278	else if (c >= 'a' && c <= 'f')
				1279	chr += 10 + c - 'a';
				1280	else
				1281	chr += 10 + c - 'A';
				1282	}
				1283	s += i;
				1284	goto store;
				1285
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1286	case 'N':
				1287	/* Ok, we need to deal with Unicode Character Names now,
				1288	* make sure we've imported the hash table data...
				1289	*/
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1290	if (pucnHash == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1291	PyObject mod = 0, v = 0;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1292	mod = PyImport_ImportModule("ucnhash");
				1293	if (mod == NULL)
				1294	goto onError;
				1295	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1296	Py_DECREF(mod);
				1297	if (v == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1298	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1299	pucnHash = PyCObject_AsVoidPtr(v);
				1300	Py_DECREF(v);
				1301	if (pucnHash == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1302	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1303	}
				1304
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1305	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1306	const char *start = s + 1;
				1307	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1308	unsigned long j;
				1309
				1310	/* look for either the closing brace, or we
				1311	* exceed the maximum length of the unicode character names
				1312	*/
				1313	while (*endBrace != '}' &&
				1314	(unsigned int)(endBrace - start) <=
				1315	pucnHash->cchMax &&
				1316	endBrace < end)
				1317	{
				1318	endBrace++;
				1319	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1320	if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1321	j = pucnHash->hash(start, endBrace - start);
				1322	if (j > pucnHash->cKeys \|\|
				1323	mystrnicmp(
				1324	start,
				1325	((_Py_UnicodeCharacterName *)
				1326	(pucnHash->getValue(j)))->pszUCN,
				1327	(int)(endBrace - start)) != 0)
				1328	{
				1329	if (unicodeescape_decoding_error(
				1330	&s, &x, errors,
				1331	"Invalid Unicode Character Name"))
				1332	{
				1333	goto onError;
				1334	}
				1335	goto ucnFallthrough;
				1336	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1337	chr = ((_Py_UnicodeCharacterName *)
				1338	(pucnHash->getValue(j)))->value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1339	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1340	goto store;
				1341	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1342	if (unicodeescape_decoding_error(
				1343	&s, &x, errors,
				1344	"Unicode name missing closing brace"))
				1345	goto onError;
				1346	goto ucnFallthrough;
				1347	}
				1348	break;
				1349	}
				1350	if (unicodeescape_decoding_error(
				1351	&s, &x, errors,
				1352	"Missing opening brace for Unicode Character Name escape"))
				1353	goto onError;
				1354	ucnFallthrough:
				1355	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1356	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1357	*p++ = '\\';
				1358	*p++ = (unsigned char)s[-1];
				1359	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1360	store:
				1361	/* when we get here, chr is a 32-bit unicode character */
				1362	if (chr <= 0xffff)
				1363	/* UCS-2 character */
				1364	*p++ = (Py_UNICODE) chr;
				1365	else if (chr <= 0x10ffff) {
				1366	/* UCS-4 character. store as two surrogate characters */
				1367	chr -= 0x10000L;
				1368	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1369	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1370	} else {
				1371	if (unicodeescape_decoding_error(
				1372	&s, &x, errors,
				1373	"Illegal Unicode character")
				1374	)
				1375	goto onError;
				1376	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1377	}
				1378	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1379	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1380	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1381	return (PyObject *)v;
				1382
				1383	onError:
				1384	Py_XDECREF(v);
				1385	return NULL;
				1386	}
				1387
				1388	/* Return a Unicode-Escape string version of the Unicode object.
				1389
				1390	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1391	appropriate.
				1392
				1393	*/
				1394
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1395	static const Py_UNICODE findchar(const Py_UNICODE s,
				1396	int size,
				1397	Py_UNICODE ch);
				1398
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1399	static
				1400	PyObject unicodeescape_string(const Py_UNICODE s,
				1401	int size,
				1402	int quotes)
				1403	{
				1404	PyObject *repr;
				1405	char *p;
				1406	char *q;
				1407
				1408	static const char *hexdigit = "0123456789ABCDEF";
				1409
				1410	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1411	if (repr == NULL)
				1412	return NULL;
				1413
				1414	p = q = PyString_AS_STRING(repr);
				1415
				1416	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1417	*p++ = 'u';
				1418	*p++ = (findchar(s, size, '\'') &&
				1419	!findchar(s, size, '"')) ? '"' : '\'';
				1420	}
				1421	while (size-- > 0) {
				1422	Py_UNICODE ch = *s++;
				1423	/* Escape quotes */
				1424	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1425	*p++ = '\\';
				1426	*p++ = (char) ch;
				1427	}
				1428	/* Map 16-bit characters to '\uxxxx' */
				1429	else if (ch >= 256) {
				1430	*p++ = '\\';
				1431	*p++ = 'u';
				1432	*p++ = hexdigit[(ch >> 12) & 0xf];
				1433	*p++ = hexdigit[(ch >> 8) & 0xf];
				1434	*p++ = hexdigit[(ch >> 4) & 0xf];
				1435	*p++ = hexdigit[ch & 15];
				1436	}
				1437	/* Map non-printable US ASCII to '\ooo' */
				1438	else if (ch < ' ' \|\| ch >= 128) {
				1439	*p++ = '\\';
				1440	*p++ = hexdigit[(ch >> 6) & 7];
				1441	*p++ = hexdigit[(ch >> 3) & 7];
				1442	*p++ = hexdigit[ch & 7];
				1443	}
				1444	/* Copy everything else as-is */
				1445	else
				1446	*p++ = (char) ch;
				1447	}
				1448	if (quotes)
				1449	*p++ = q[1];
				1450
				1451	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1452	if (_PyString_Resize(&repr, p - q))
				1453	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1454
				1455	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1456
				1457	onError:
				1458	Py_DECREF(repr);
				1459	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1460	}
				1461
				1462	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1463	int size)
				1464	{
				1465	return unicodeescape_string(s, size, 0);
				1466	}
				1467
				1468	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1469	{
				1470	if (!PyUnicode_Check(unicode)) {
				1471	PyErr_BadArgument();
				1472	return NULL;
				1473	}
				1474	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1475	PyUnicode_GET_SIZE(unicode));
				1476	}
				1477
				1478	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1479
				1480	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1481	int size,
				1482	const char *errors)
				1483	{
				1484	PyUnicodeObject *v;
				1485	Py_UNICODE p, buf;
				1486	const char *end;
				1487	const char *bs;
				1488
				1489	/* Escaped strings will always be longer than the resulting
				1490	Unicode string, so we start with size here and then reduce the
				1491	length after conversion to the true value. */
				1492	v = _PyUnicode_New(size);
				1493	if (v == NULL)
				1494	goto onError;
				1495	if (size == 0)
				1496	return (PyObject *)v;
				1497	p = buf = PyUnicode_AS_UNICODE(v);
				1498	end = s + size;
				1499	while (s < end) {
				1500	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1501	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1502	int i;
				1503
				1504	/* Non-escape characters are interpreted as Unicode ordinals */
				1505	if (*s != '\\') {
				1506	p++ = (unsigned char)s++;
				1507	continue;
				1508	}
				1509
				1510	/* \u-escapes are only interpreted iff the number of leading
				1511	backslashes if odd */
				1512	bs = s;
				1513	for (;s < end;) {
				1514	if (*s != '\\')
				1515	break;
				1516	p++ = (unsigned char)s++;
				1517	}
				1518	if (((s - bs) & 1) == 0 \|\|
				1519	s >= end \|\|
				1520	*s != 'u') {
				1521	continue;
				1522	}
				1523	p--;
				1524	s++;
				1525
				1526	/* \uXXXX with 4 hex digits */
				1527	for (x = 0, i = 0; i < 4; i++) {
				1528	c = (unsigned char)s[i];
				1529	if (!isxdigit(c)) {
				1530	if (unicodeescape_decoding_error(&s, &x, errors,
				1531	"truncated \\uXXXX"))
				1532	goto onError;
				1533	i++;
				1534	break;
				1535	}
				1536	x = (x<<4) & ~0xF;
				1537	if (c >= '0' && c <= '9')
				1538	x += c - '0';
				1539	else if (c >= 'a' && c <= 'f')
				1540	x += 10 + c - 'a';
				1541	else
				1542	x += 10 + c - 'A';
				1543	}
				1544	s += i;
				1545	*p++ = x;
				1546	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1547	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1548	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1549	return (PyObject *)v;
				1550
				1551	onError:
				1552	Py_XDECREF(v);
				1553	return NULL;
				1554	}
				1555
				1556	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1557	int size)
				1558	{
				1559	PyObject *repr;
				1560	char *p;
				1561	char *q;
				1562
				1563	static const char *hexdigit = "0123456789ABCDEF";
				1564
				1565	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1566	if (repr == NULL)
				1567	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1568	if (size == 0)
				1569	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1570
				1571	p = q = PyString_AS_STRING(repr);
				1572	while (size-- > 0) {
				1573	Py_UNICODE ch = *s++;
				1574	/* Map 16-bit characters to '\uxxxx' */
				1575	if (ch >= 256) {
				1576	*p++ = '\\';
				1577	*p++ = 'u';
				1578	*p++ = hexdigit[(ch >> 12) & 0xf];
				1579	*p++ = hexdigit[(ch >> 8) & 0xf];
				1580	*p++ = hexdigit[(ch >> 4) & 0xf];
				1581	*p++ = hexdigit[ch & 15];
				1582	}
				1583	/* Copy everything else as-is */
				1584	else
				1585	*p++ = (char) ch;
				1586	}
				1587	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1588	if (_PyString_Resize(&repr, p - q))
				1589	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590
				1591	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1592
				1593	onError:
				1594	Py_DECREF(repr);
				1595	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1596	}
				1597
				1598	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1599	{
				1600	if (!PyUnicode_Check(unicode)) {
				1601	PyErr_BadArgument();
				1602	return NULL;
				1603	}
				1604	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1605	PyUnicode_GET_SIZE(unicode));
				1606	}
				1607
				1608	/* --- Latin-1 Codec ------------------------------------------------------ */
				1609
				1610	PyObject PyUnicode_DecodeLatin1(const char s,
				1611	int size,
				1612	const char *errors)
				1613	{
				1614	PyUnicodeObject *v;
				1615	Py_UNICODE *p;
				1616
				1617	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1618	v = _PyUnicode_New(size);
				1619	if (v == NULL)
				1620	goto onError;
				1621	if (size == 0)
				1622	return (PyObject *)v;
				1623	p = PyUnicode_AS_UNICODE(v);
				1624	while (size-- > 0)
				1625	p++ = (unsigned char)s++;
				1626	return (PyObject *)v;
				1627
				1628	onError:
				1629	Py_XDECREF(v);
				1630	return NULL;
				1631	}
				1632
				1633	static
				1634	int latin1_encoding_error(const Py_UNICODE **source,
				1635	char **dest,
				1636	const char *errors,
				1637	const char *details)
				1638	{
				1639	if ((errors == NULL) \|\|
				1640	(strcmp(errors,"strict") == 0)) {
				1641	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1642	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1643	details);
				1644	return -1;
				1645	}
				1646	else if (strcmp(errors,"ignore") == 0) {
				1647	return 0;
				1648	}
				1649	else if (strcmp(errors,"replace") == 0) {
				1650	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1651	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1652	return 0;
				1653	}
				1654	else {
				1655	PyErr_Format(PyExc_ValueError,
				1656	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1657	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1658	errors);
				1659	return -1;
				1660	}
				1661	}
				1662
				1663	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1664	int size,
				1665	const char *errors)
				1666	{
				1667	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1668	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1669
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1670	repr = PyString_FromStringAndSize(NULL, size);
				1671	if (repr == NULL)
				1672	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1673	if (size == 0)
				1674	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1675
				1676	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1677	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1678	while (size-- > 0) {
				1679	Py_UNICODE ch = *p++;
				1680	if (ch >= 256) {
				1681	if (latin1_encoding_error(&p, &s, errors,
				1682	"ordinal not in range(256)"))
				1683	goto onError;
				1684	}
				1685	else
				1686	*s++ = (char)ch;
				1687	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1688	/* Resize if error handling skipped some characters */
				1689	if (s - start < PyString_GET_SIZE(repr))
				1690	if (_PyString_Resize(&repr, s - start))
				1691	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1692	return repr;
				1693
				1694	onError:
				1695	Py_DECREF(repr);
				1696	return NULL;
				1697	}
				1698
				1699	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1700	{
				1701	if (!PyUnicode_Check(unicode)) {
				1702	PyErr_BadArgument();
				1703	return NULL;
				1704	}
				1705	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1706	PyUnicode_GET_SIZE(unicode),
				1707	NULL);
				1708	}
				1709
				1710	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1711
				1712	static
				1713	int ascii_decoding_error(const char **source,
				1714	Py_UNICODE **dest,
				1715	const char *errors,
				1716	const char *details)
				1717	{
				1718	if ((errors == NULL) \|\|
				1719	(strcmp(errors,"strict") == 0)) {
				1720	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1721	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1722	details);
				1723	return -1;
				1724	}
				1725	else if (strcmp(errors,"ignore") == 0) {
				1726	return 0;
				1727	}
				1728	else if (strcmp(errors,"replace") == 0) {
				1729	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1730	(*dest)++;
				1731	return 0;
				1732	}
				1733	else {
				1734	PyErr_Format(PyExc_ValueError,
				1735	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1736	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1737	errors);
				1738	return -1;
				1739	}
				1740	}
				1741
				1742	PyObject PyUnicode_DecodeASCII(const char s,
				1743	int size,
				1744	const char *errors)
				1745	{
				1746	PyUnicodeObject *v;
				1747	Py_UNICODE *p;
				1748
				1749	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1750	v = _PyUnicode_New(size);
				1751	if (v == NULL)
				1752	goto onError;
				1753	if (size == 0)
				1754	return (PyObject *)v;
				1755	p = PyUnicode_AS_UNICODE(v);
				1756	while (size-- > 0) {
				1757	register unsigned char c;
				1758
				1759	c = (unsigned char)*s++;
				1760	if (c < 128)
				1761	*p++ = c;
				1762	else if (ascii_decoding_error(&s, &p, errors,
				1763	"ordinal not in range(128)"))
				1764	goto onError;
				1765	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1766	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1767	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1768	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1769	return (PyObject *)v;
				1770
				1771	onError:
				1772	Py_XDECREF(v);
				1773	return NULL;
				1774	}
				1775
				1776	static
				1777	int ascii_encoding_error(const Py_UNICODE **source,
				1778	char **dest,
				1779	const char *errors,
				1780	const char *details)
				1781	{
				1782	if ((errors == NULL) \|\|
				1783	(strcmp(errors,"strict") == 0)) {
				1784	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1785	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1786	details);
				1787	return -1;
				1788	}
				1789	else if (strcmp(errors,"ignore") == 0) {
				1790	return 0;
				1791	}
				1792	else if (strcmp(errors,"replace") == 0) {
				1793	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1794	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1795	return 0;
				1796	}
				1797	else {
				1798	PyErr_Format(PyExc_ValueError,
				1799	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1800	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1801	errors);
				1802	return -1;
				1803	}
				1804	}
				1805
				1806	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1807	int size,
				1808	const char *errors)
				1809	{
				1810	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1811	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1812
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1813	repr = PyString_FromStringAndSize(NULL, size);
				1814	if (repr == NULL)
				1815	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1816	if (size == 0)
				1817	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1818
				1819	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1820	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1821	while (size-- > 0) {
				1822	Py_UNICODE ch = *p++;
				1823	if (ch >= 128) {
				1824	if (ascii_encoding_error(&p, &s, errors,
				1825	"ordinal not in range(128)"))
				1826	goto onError;
				1827	}
				1828	else
				1829	*s++ = (char)ch;
				1830	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1831	/* Resize if error handling skipped some characters */
				1832	if (s - start < PyString_GET_SIZE(repr))
				1833	if (_PyString_Resize(&repr, s - start))
				1834	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1835	return repr;
				1836
				1837	onError:
				1838	Py_DECREF(repr);
				1839	return NULL;
				1840	}
				1841
				1842	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1843	{
				1844	if (!PyUnicode_Check(unicode)) {
				1845	PyErr_BadArgument();
				1846	return NULL;
				1847	}
				1848	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1849	PyUnicode_GET_SIZE(unicode),
				1850	NULL);
				1851	}
				1852
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1853	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1854
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1855	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1856
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1857	PyObject PyUnicode_DecodeMBCS(const char s,
				1858	int size,
				1859	const char *errors)
				1860	{
				1861	PyUnicodeObject *v;
				1862	Py_UNICODE *p;
				1863
				1864	/* First get the size of the result */
				1865	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1866	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1867	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1868
				1869	v = _PyUnicode_New(usize);
				1870	if (v == NULL)
				1871	return NULL;
				1872	if (usize == 0)
				1873	return (PyObject *)v;
				1874	p = PyUnicode_AS_UNICODE(v);
				1875	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1876	Py_DECREF(v);
				1877	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1878	}
				1879
				1880	return (PyObject *)v;
				1881	}
				1882
				1883	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1884	int size,
				1885	const char *errors)
				1886	{
				1887	PyObject *repr;
				1888	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1889	DWORD mbcssize;
				1890
				1891	/* If there are no characters, bail now! */
				1892	if (size==0)
				1893	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1894
				1895	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1896	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1897	if (mbcssize==0)
				1898	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1899
				1900	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1901	if (repr == NULL)
				1902	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1903	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1904	return repr;
				1905
				1906	/* Do the conversion */
				1907	s = PyString_AS_STRING(repr);
				1908	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1909	Py_DECREF(repr);
				1910	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1911	}
				1912	return repr;
				1913	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1914
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1915	#endif /* MS_WIN32 */
				1916
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1917	/* --- Character Mapping Codec -------------------------------------------- */
				1918
				1919	static
				1920	int charmap_decoding_error(const char **source,
				1921	Py_UNICODE **dest,
				1922	const char *errors,
				1923	const char *details)
				1924	{
				1925	if ((errors == NULL) \|\|
				1926	(strcmp(errors,"strict") == 0)) {
				1927	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1928	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1929	details);
				1930	return -1;
				1931	}
				1932	else if (strcmp(errors,"ignore") == 0) {
				1933	return 0;
				1934	}
				1935	else if (strcmp(errors,"replace") == 0) {
				1936	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1937	(*dest)++;
				1938	return 0;
				1939	}
				1940	else {
				1941	PyErr_Format(PyExc_ValueError,
				1942	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1943	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1944	errors);
				1945	return -1;
				1946	}
				1947	}
				1948
				1949	PyObject PyUnicode_DecodeCharmap(const char s,
				1950	int size,
				1951	PyObject *mapping,
				1952	const char *errors)
				1953	{
				1954	PyUnicodeObject *v;
				1955	Py_UNICODE *p;
				1956
				1957	/* Default to Latin-1 */
				1958	if (mapping == NULL)
				1959	return PyUnicode_DecodeLatin1(s, size, errors);
				1960
				1961	v = _PyUnicode_New(size);
				1962	if (v == NULL)
				1963	goto onError;
				1964	if (size == 0)
				1965	return (PyObject *)v;
				1966	p = PyUnicode_AS_UNICODE(v);
				1967	while (size-- > 0) {
				1968	unsigned char ch = *s++;
				1969	PyObject w, x;
				1970
				1971	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1972	w = PyInt_FromLong((long)ch);
				1973	if (w == NULL)
				1974	goto onError;
				1975	x = PyObject_GetItem(mapping, w);
				1976	Py_DECREF(w);
				1977	if (x == NULL) {
				1978	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1979	/* No mapping found: default to Latin-1 mapping */
				1980	PyErr_Clear();
				1981	*p++ = (Py_UNICODE)ch;
				1982	continue;
				1983	}
				1984	goto onError;
				1985	}
				1986
				1987	/* Apply mapping */
				1988	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1989	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1990	if (value < 0 \|\| value > 65535) {
				1991	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1992	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1993	Py_DECREF(x);
				1994	goto onError;
				1995	}
				1996	*p++ = (Py_UNICODE)value;
				1997	}
				1998	else if (x == Py_None) {
				1999	/* undefined mapping */
				2000	if (charmap_decoding_error(&s, &p, errors,
				2001	"character maps to <undefined>")) {
				2002	Py_DECREF(x);
				2003	goto onError;
				2004	}
				2005	}
				2006	else if (PyUnicode_Check(x)) {
				2007	if (PyUnicode_GET_SIZE(x) != 1) {
				2008	/* 1-n mapping */
				2009	PyErr_SetString(PyExc_NotImplementedError,
				2010	"1-n mappings are currently not implemented");
				2011	Py_DECREF(x);
				2012	goto onError;
				2013	}
				2014	p++ = PyUnicode_AS_UNICODE(x);
				2015	}
				2016	else {
				2017	/* wrong return value */
				2018	PyErr_SetString(PyExc_TypeError,
				2019	"character mapping must return integer, None or unicode");
				2020	Py_DECREF(x);
				2021	goto onError;
				2022	}
				2023	Py_DECREF(x);
				2024	}
				2025	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2026	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2027	goto onError;
				2028	return (PyObject *)v;
				2029
				2030	onError:
				2031	Py_XDECREF(v);
				2032	return NULL;
				2033	}
				2034
				2035	static
				2036	int charmap_encoding_error(const Py_UNICODE **source,
				2037	char **dest,
				2038	const char *errors,
				2039	const char *details)
				2040	{
				2041	if ((errors == NULL) \|\|
				2042	(strcmp(errors,"strict") == 0)) {
				2043	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2044	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2045	details);
				2046	return -1;
				2047	}
				2048	else if (strcmp(errors,"ignore") == 0) {
				2049	return 0;
				2050	}
				2051	else if (strcmp(errors,"replace") == 0) {
				2052	**dest = '?';
				2053	(*dest)++;
				2054	return 0;
				2055	}
				2056	else {
				2057	PyErr_Format(PyExc_ValueError,
				2058	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2059	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2060	errors);
				2061	return -1;
				2062	}
				2063	}
				2064
				2065	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2066	int size,
				2067	PyObject *mapping,
				2068	const char *errors)
				2069	{
				2070	PyObject *v;
				2071	char *s;
				2072
				2073	/* Default to Latin-1 */
				2074	if (mapping == NULL)
				2075	return PyUnicode_EncodeLatin1(p, size, errors);
				2076
				2077	v = PyString_FromStringAndSize(NULL, size);
				2078	if (v == NULL)
				2079	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2080	if (size == 0)
				2081	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2082	s = PyString_AS_STRING(v);
				2083	while (size-- > 0) {
				2084	Py_UNICODE ch = *p++;
				2085	PyObject w, x;
				2086
				2087	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2088	w = PyInt_FromLong((long)ch);
				2089	if (w == NULL)
				2090	goto onError;
				2091	x = PyObject_GetItem(mapping, w);
				2092	Py_DECREF(w);
				2093	if (x == NULL) {
				2094	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2095	/* No mapping found: default to Latin-1 mapping if possible */
				2096	PyErr_Clear();
				2097	if (ch < 256) {
				2098	*s++ = (char)ch;
				2099	continue;
				2100	}
				2101	else if (!charmap_encoding_error(&p, &s, errors,
				2102	"missing character mapping"))
				2103	continue;
				2104	}
				2105	goto onError;
				2106	}
				2107
				2108	/* Apply mapping */
				2109	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2110	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2111	if (value < 0 \|\| value > 255) {
				2112	PyErr_SetString(PyExc_TypeError,
				2113	"character mapping must be in range(256)");
				2114	Py_DECREF(x);
				2115	goto onError;
				2116	}
				2117	*s++ = (char)value;
				2118	}
				2119	else if (x == Py_None) {
				2120	/* undefined mapping */
				2121	if (charmap_encoding_error(&p, &s, errors,
				2122	"character maps to <undefined>")) {
				2123	Py_DECREF(x);
				2124	goto onError;
				2125	}
				2126	}
				2127	else if (PyString_Check(x)) {
				2128	if (PyString_GET_SIZE(x) != 1) {
				2129	/* 1-n mapping */
				2130	PyErr_SetString(PyExc_NotImplementedError,
				2131	"1-n mappings are currently not implemented");
				2132	Py_DECREF(x);
				2133	goto onError;
				2134	}
				2135	s++ = PyString_AS_STRING(x);
				2136	}
				2137	else {
				2138	/* wrong return value */
				2139	PyErr_SetString(PyExc_TypeError,
				2140	"character mapping must return integer, None or unicode");
				2141	Py_DECREF(x);
				2142	goto onError;
				2143	}
				2144	Py_DECREF(x);
				2145	}
				2146	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2147	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2148	goto onError;
				2149	return v;
				2150
				2151	onError:
				2152	Py_DECREF(v);
				2153	return NULL;
				2154	}
				2155
				2156	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2157	PyObject *mapping)
				2158	{
				2159	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2160	PyErr_BadArgument();
				2161	return NULL;
				2162	}
				2163	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2164	PyUnicode_GET_SIZE(unicode),
				2165	mapping,
				2166	NULL);
				2167	}
				2168
				2169	static
				2170	int translate_error(const Py_UNICODE **source,
				2171	Py_UNICODE **dest,
				2172	const char *errors,
				2173	const char *details)
				2174	{
				2175	if ((errors == NULL) \|\|
				2176	(strcmp(errors,"strict") == 0)) {
				2177	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2178	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2179	details);
				2180	return -1;
				2181	}
				2182	else if (strcmp(errors,"ignore") == 0) {
				2183	return 0;
				2184	}
				2185	else if (strcmp(errors,"replace") == 0) {
				2186	**dest = '?';
				2187	(*dest)++;
				2188	return 0;
				2189	}
				2190	else {
				2191	PyErr_Format(PyExc_ValueError,
				2192	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2193	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2194	errors);
				2195	return -1;
				2196	}
				2197	}
				2198
				2199	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2200	int size,
				2201	PyObject *mapping,
				2202	const char *errors)
				2203	{
				2204	PyUnicodeObject *v;
				2205	Py_UNICODE *p;
				2206
				2207	if (mapping == NULL) {
				2208	PyErr_BadArgument();
				2209	return NULL;
				2210	}
				2211
				2212	/* Output will never be longer than input */
				2213	v = _PyUnicode_New(size);
				2214	if (v == NULL)
				2215	goto onError;
				2216	if (size == 0)
				2217	goto done;
				2218	p = PyUnicode_AS_UNICODE(v);
				2219	while (size-- > 0) {
				2220	Py_UNICODE ch = *s++;
				2221	PyObject w, x;
				2222
				2223	/* Get mapping */
				2224	w = PyInt_FromLong(ch);
				2225	if (w == NULL)
				2226	goto onError;
				2227	x = PyObject_GetItem(mapping, w);
				2228	Py_DECREF(w);
				2229	if (x == NULL) {
				2230	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2231	/* No mapping found: default to 1-1 mapping */
				2232	PyErr_Clear();
				2233	*p++ = ch;
				2234	continue;
				2235	}
				2236	goto onError;
				2237	}
				2238
				2239	/* Apply mapping */
				2240	if (PyInt_Check(x))
				2241	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2242	else if (x == Py_None) {
				2243	/* undefined mapping */
				2244	if (translate_error(&s, &p, errors,
				2245	"character maps to <undefined>")) {
				2246	Py_DECREF(x);
				2247	goto onError;
				2248	}
				2249	}
				2250	else if (PyUnicode_Check(x)) {
				2251	if (PyUnicode_GET_SIZE(x) != 1) {
				2252	/* 1-n mapping */
				2253	PyErr_SetString(PyExc_NotImplementedError,
				2254	"1-n mappings are currently not implemented");
				2255	Py_DECREF(x);
				2256	goto onError;
				2257	}
				2258	p++ = PyUnicode_AS_UNICODE(x);
				2259	}
				2260	else {
				2261	/* wrong return value */
				2262	PyErr_SetString(PyExc_TypeError,
				2263	"translate mapping must return integer, None or unicode");
				2264	Py_DECREF(x);
				2265	goto onError;
				2266	}
				2267	Py_DECREF(x);
				2268	}
				2269	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2270	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2271	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2272
				2273	done:
				2274	return (PyObject *)v;
				2275
				2276	onError:
				2277	Py_XDECREF(v);
				2278	return NULL;
				2279	}
				2280
				2281	PyObject PyUnicode_Translate(PyObject str,
				2282	PyObject *mapping,
				2283	const char *errors)
				2284	{
				2285	PyObject *result;
				2286
				2287	str = PyUnicode_FromObject(str);
				2288	if (str == NULL)
				2289	goto onError;
				2290	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2291	PyUnicode_GET_SIZE(str),
				2292	mapping,
				2293	errors);
				2294	Py_DECREF(str);
				2295	return result;
				2296
				2297	onError:
				2298	Py_XDECREF(str);
				2299	return NULL;
				2300	}
				2301
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2302	/* --- Decimal Encoder ---------------------------------------------------- */
				2303
				2304	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2305	int length,
				2306	char *output,
				2307	const char *errors)
				2308	{
				2309	Py_UNICODE p, end;
				2310
				2311	if (output == NULL) {
				2312	PyErr_BadArgument();
				2313	return -1;
				2314	}
				2315
				2316	p = s;
				2317	end = s + length;
				2318	while (p < end) {
				2319	register Py_UNICODE ch = *p++;
				2320	int decimal;
				2321
				2322	if (Py_UNICODE_ISSPACE(ch)) {
				2323	*output++ = ' ';
				2324	continue;
				2325	}
				2326	decimal = Py_UNICODE_TODECIMAL(ch);
				2327	if (decimal >= 0) {
				2328	*output++ = '0' + decimal;
				2329	continue;
				2330	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2331	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2332	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2333	continue;
				2334	}
				2335	/* All other characters are considered invalid */
				2336	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2337	PyErr_SetString(PyExc_ValueError,
				2338	"invalid decimal Unicode string");
				2339	goto onError;
				2340	}
				2341	else if (strcmp(errors, "ignore") == 0)
				2342	continue;
				2343	else if (strcmp(errors, "replace") == 0) {
				2344	*output++ = '?';
				2345	continue;
				2346	}
				2347	}
				2348	/* 0-terminate the output string */
				2349	*output++ = '\0';
				2350	return 0;
				2351
				2352	onError:
				2353	return -1;
				2354	}
				2355
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2356	/* --- Helpers ------------------------------------------------------------ */
				2357
				2358	static
				2359	int count(PyUnicodeObject *self,
				2360	int start,
				2361	int end,
				2362	PyUnicodeObject *substring)
				2363	{
				2364	int count = 0;
				2365
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2366	if (substring->length == 0)
				2367	return (end - start + 1);
				2368
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2369	end -= substring->length;
				2370
				2371	while (start <= end)
				2372	if (Py_UNICODE_MATCH(self, start, substring)) {
				2373	count++;
				2374	start += substring->length;
				2375	} else
				2376	start++;
				2377
				2378	return count;
				2379	}
				2380
				2381	int PyUnicode_Count(PyObject *str,
				2382	PyObject *substr,
				2383	int start,
				2384	int end)
				2385	{
				2386	int result;
				2387
				2388	str = PyUnicode_FromObject(str);
				2389	if (str == NULL)
				2390	return -1;
				2391	substr = PyUnicode_FromObject(substr);
				2392	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2393	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2394	return -1;
				2395	}
				2396
				2397	result = count((PyUnicodeObject *)str,
				2398	start, end,
				2399	(PyUnicodeObject *)substr);
				2400
				2401	Py_DECREF(str);
				2402	Py_DECREF(substr);
				2403	return result;
				2404	}
				2405
				2406	static
				2407	int findstring(PyUnicodeObject *self,
				2408	PyUnicodeObject *substring,
				2409	int start,
				2410	int end,
				2411	int direction)
				2412	{
				2413	if (start < 0)
				2414	start += self->length;
				2415	if (start < 0)
				2416	start = 0;
				2417
				2418	if (substring->length == 0)
				2419	return start;
				2420
				2421	if (end > self->length)
				2422	end = self->length;
				2423	if (end < 0)
				2424	end += self->length;
				2425	if (end < 0)
				2426	end = 0;
				2427
				2428	end -= substring->length;
				2429
				2430	if (direction < 0) {
				2431	for (; end >= start; end--)
				2432	if (Py_UNICODE_MATCH(self, end, substring))
				2433	return end;
				2434	} else {
				2435	for (; start <= end; start++)
				2436	if (Py_UNICODE_MATCH(self, start, substring))
				2437	return start;
				2438	}
				2439
				2440	return -1;
				2441	}
				2442
				2443	int PyUnicode_Find(PyObject *str,
				2444	PyObject *substr,
				2445	int start,
				2446	int end,
				2447	int direction)
				2448	{
				2449	int result;
				2450
				2451	str = PyUnicode_FromObject(str);
				2452	if (str == NULL)
				2453	return -1;
				2454	substr = PyUnicode_FromObject(substr);
				2455	if (substr == NULL) {
				2456	Py_DECREF(substr);
				2457	return -1;
				2458	}
				2459
				2460	result = findstring((PyUnicodeObject *)str,
				2461	(PyUnicodeObject *)substr,
				2462	start, end, direction);
				2463	Py_DECREF(str);
				2464	Py_DECREF(substr);
				2465	return result;
				2466	}
				2467
				2468	static
				2469	int tailmatch(PyUnicodeObject *self,
				2470	PyUnicodeObject *substring,
				2471	int start,
				2472	int end,
				2473	int direction)
				2474	{
				2475	if (start < 0)
				2476	start += self->length;
				2477	if (start < 0)
				2478	start = 0;
				2479
				2480	if (substring->length == 0)
				2481	return 1;
				2482
				2483	if (end > self->length)
				2484	end = self->length;
				2485	if (end < 0)
				2486	end += self->length;
				2487	if (end < 0)
				2488	end = 0;
				2489
				2490	end -= substring->length;
				2491	if (end < start)
				2492	return 0;
				2493
				2494	if (direction > 0) {
				2495	if (Py_UNICODE_MATCH(self, end, substring))
				2496	return 1;
				2497	} else {
				2498	if (Py_UNICODE_MATCH(self, start, substring))
				2499	return 1;
				2500	}
				2501
				2502	return 0;
				2503	}
				2504
				2505	int PyUnicode_Tailmatch(PyObject *str,
				2506	PyObject *substr,
				2507	int start,
				2508	int end,
				2509	int direction)
				2510	{
				2511	int result;
				2512
				2513	str = PyUnicode_FromObject(str);
				2514	if (str == NULL)
				2515	return -1;
				2516	substr = PyUnicode_FromObject(substr);
				2517	if (substr == NULL) {
				2518	Py_DECREF(substr);
				2519	return -1;
				2520	}
				2521
				2522	result = tailmatch((PyUnicodeObject *)str,
				2523	(PyUnicodeObject *)substr,
				2524	start, end, direction);
				2525	Py_DECREF(str);
				2526	Py_DECREF(substr);
				2527	return result;
				2528	}
				2529
				2530	static
				2531	const Py_UNICODE findchar(const Py_UNICODE s,
				2532	int size,
				2533	Py_UNICODE ch)
				2534	{
				2535	/* like wcschr, but doesn't stop at NULL characters */
				2536
				2537	while (size-- > 0) {
				2538	if (*s == ch)
				2539	return s;
				2540	s++;
				2541	}
				2542
				2543	return NULL;
				2544	}
				2545
				2546	/* Apply fixfct filter to the Unicode object self and return a
				2547	reference to the modified object */
				2548
				2549	static
				2550	PyObject fixup(PyUnicodeObject self,
				2551	int (fixfct)(PyUnicodeObject s))
				2552	{
				2553
				2554	PyUnicodeObject *u;
				2555
				2556	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2557	self->length);
				2558	if (u == NULL)
				2559	return NULL;
				2560	if (!fixfct(u)) {
				2561	/* fixfct should return TRUE if it modified the buffer. If
				2562	FALSE, return a reference to the original buffer instead
				2563	(to save space, not time) */
				2564	Py_INCREF(self);
				2565	Py_DECREF(u);
				2566	return (PyObject*) self;
				2567	}
				2568	return (PyObject*) u;
				2569	}
				2570
				2571	static
				2572	int fixupper(PyUnicodeObject *self)
				2573	{
				2574	int len = self->length;
				2575	Py_UNICODE *s = self->str;
				2576	int status = 0;
				2577
				2578	while (len-- > 0) {
				2579	register Py_UNICODE ch;
				2580
				2581	ch = Py_UNICODE_TOUPPER(*s);
				2582	if (ch != *s) {
				2583	status = 1;
				2584	*s = ch;
				2585	}
				2586	s++;
				2587	}
				2588
				2589	return status;
				2590	}
				2591
				2592	static
				2593	int fixlower(PyUnicodeObject *self)
				2594	{
				2595	int len = self->length;
				2596	Py_UNICODE *s = self->str;
				2597	int status = 0;
				2598
				2599	while (len-- > 0) {
				2600	register Py_UNICODE ch;
				2601
				2602	ch = Py_UNICODE_TOLOWER(*s);
				2603	if (ch != *s) {
				2604	status = 1;
				2605	*s = ch;
				2606	}
				2607	s++;
				2608	}
				2609
				2610	return status;
				2611	}
				2612
				2613	static
				2614	int fixswapcase(PyUnicodeObject *self)
				2615	{
				2616	int len = self->length;
				2617	Py_UNICODE *s = self->str;
				2618	int status = 0;
				2619
				2620	while (len-- > 0) {
				2621	if (Py_UNICODE_ISUPPER(*s)) {
				2622	s = Py_UNICODE_TOLOWER(s);
				2623	status = 1;
				2624	} else if (Py_UNICODE_ISLOWER(*s)) {
				2625	s = Py_UNICODE_TOUPPER(s);
				2626	status = 1;
				2627	}
				2628	s++;
				2629	}
				2630
				2631	return status;
				2632	}
				2633
				2634	static
				2635	int fixcapitalize(PyUnicodeObject *self)
				2636	{
				2637	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2638	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2639	return 1;
				2640	}
				2641	return 0;
				2642	}
				2643
				2644	static
				2645	int fixtitle(PyUnicodeObject *self)
				2646	{
				2647	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2648	register Py_UNICODE *e;
				2649	int previous_is_cased;
				2650
				2651	/* Shortcut for single character strings */
				2652	if (PyUnicode_GET_SIZE(self) == 1) {
				2653	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2654	if (*p != ch) {
				2655	*p = ch;
				2656	return 1;
				2657	}
				2658	else
				2659	return 0;
				2660	}
				2661
				2662	e = p + PyUnicode_GET_SIZE(self);
				2663	previous_is_cased = 0;
				2664	for (; p < e; p++) {
				2665	register const Py_UNICODE ch = *p;
				2666
				2667	if (previous_is_cased)
				2668	*p = Py_UNICODE_TOLOWER(ch);
				2669	else
				2670	*p = Py_UNICODE_TOTITLE(ch);
				2671
				2672	if (Py_UNICODE_ISLOWER(ch) \|\|
				2673	Py_UNICODE_ISUPPER(ch) \|\|
				2674	Py_UNICODE_ISTITLE(ch))
				2675	previous_is_cased = 1;
				2676	else
				2677	previous_is_cased = 0;
				2678	}
				2679	return 1;
				2680	}
				2681
				2682	PyObject PyUnicode_Join(PyObject separator,
				2683	PyObject *seq)
				2684	{
				2685	Py_UNICODE *sep;
				2686	int seplen;
				2687	PyUnicodeObject *res = NULL;
				2688	int reslen = 0;
				2689	Py_UNICODE *p;
				2690	int seqlen = 0;
				2691	int sz = 100;
				2692	int i;
				2693
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2694	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2695	if (seqlen < 0 && PyErr_Occurred())
				2696	return NULL;
				2697
				2698	if (separator == NULL) {
				2699	Py_UNICODE blank = ' ';
				2700	sep = &blank;
				2701	seplen = 1;
				2702	}
				2703	else {
				2704	separator = PyUnicode_FromObject(separator);
				2705	if (separator == NULL)
				2706	return NULL;
				2707	sep = PyUnicode_AS_UNICODE(separator);
				2708	seplen = PyUnicode_GET_SIZE(separator);
				2709	}
				2710
				2711	res = _PyUnicode_New(sz);
				2712	if (res == NULL)
				2713	goto onError;
				2714	p = PyUnicode_AS_UNICODE(res);
				2715	reslen = 0;
				2716
				2717	for (i = 0; i < seqlen; i++) {
				2718	int itemlen;
				2719	PyObject *item;
				2720
				2721	item = PySequence_GetItem(seq, i);
				2722	if (item == NULL)
				2723	goto onError;
				2724	if (!PyUnicode_Check(item)) {
				2725	PyObject *v;
				2726	v = PyUnicode_FromObject(item);
				2727	Py_DECREF(item);
				2728	item = v;
				2729	if (item == NULL)
				2730	goto onError;
				2731	}
				2732	itemlen = PyUnicode_GET_SIZE(item);
				2733	while (reslen + itemlen + seplen >= sz) {
				2734	if (_PyUnicode_Resize(res, sz*2))
				2735	goto onError;
				2736	sz *= 2;
				2737	p = PyUnicode_AS_UNICODE(res) + reslen;
				2738	}
				2739	if (i > 0) {
				2740	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2741	p += seplen;
				2742	reslen += seplen;
				2743	}
				2744	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2745	p += itemlen;
				2746	reslen += itemlen;
				2747	Py_DECREF(item);
				2748	}
				2749	if (_PyUnicode_Resize(res, reslen))
				2750	goto onError;
				2751
				2752	Py_XDECREF(separator);
				2753	return (PyObject *)res;
				2754
				2755	onError:
				2756	Py_XDECREF(separator);
				2757	Py_DECREF(res);
				2758	return NULL;
				2759	}
				2760
				2761	static
				2762	PyUnicodeObject pad(PyUnicodeObject self,
				2763	int left,
				2764	int right,
				2765	Py_UNICODE fill)
				2766	{
				2767	PyUnicodeObject *u;
				2768
				2769	if (left < 0)
				2770	left = 0;
				2771	if (right < 0)
				2772	right = 0;
				2773
				2774	if (left == 0 && right == 0) {
				2775	Py_INCREF(self);
				2776	return self;
				2777	}
				2778
				2779	u = _PyUnicode_New(left + self->length + right);
				2780	if (u) {
				2781	if (left)
				2782	Py_UNICODE_FILL(u->str, fill, left);
				2783	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2784	if (right)
				2785	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2786	}
				2787
				2788	return u;
				2789	}
				2790
				/* Append the slice data[left:right] as a new Unicode object to `list`.

				   Relies on the expanding function providing a PyObject *str scratch
				   variable, a PyObject *list and an onError label, which is jumped to
				   when the slice cannot be created or appended.  Arguments are
				   parenthesized and the body is a single statement, so the macro is
				   safe inside unbraced if/else. */
				#define SPLIT_APPEND(data, left, right) \
				    do { \
				        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
				        if (!str) \
				            goto onError; \
				        if (PyList_Append(list, str)) { \
				            Py_DECREF(str); \
				            goto onError; \
				        } \
				        Py_DECREF(str); \
				    } while (0)
				2801
				2802	static
				2803	PyObject split_whitespace(PyUnicodeObject self,
				2804	PyObject *list,
				2805	int maxcount)
				2806	{
				2807	register int i;
				2808	register int j;
				2809	int len = self->length;
				2810	PyObject *str;
				2811
				2812	for (i = j = 0; i < len; ) {
				2813	/* find a token */
				2814	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2815	i++;
				2816	j = i;
				2817	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2818	i++;
				2819	if (j < i) {
				2820	if (maxcount-- <= 0)
				2821	break;
				2822	SPLIT_APPEND(self->str, j, i);
				2823	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2824	i++;
				2825	j = i;
				2826	}
				2827	}
				2828	if (j < len) {
				2829	SPLIT_APPEND(self->str, j, len);
				2830	}
				2831	return list;
				2832
				2833	onError:
				2834	Py_DECREF(list);
				2835	return NULL;
				2836	}
				2837
				2838	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2839	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2840	{
				2841	register int i;
				2842	register int j;
				2843	int len;
				2844	PyObject *list;
				2845	PyObject *str;
				2846	Py_UNICODE *data;
				2847
				2848	string = PyUnicode_FromObject(string);
				2849	if (string == NULL)
				2850	return NULL;
				2851	data = PyUnicode_AS_UNICODE(string);
				2852	len = PyUnicode_GET_SIZE(string);
				2853
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2854	list = PyList_New(0);
				2855	if (!list)
				2856	goto onError;
				2857
				2858	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2859	int eol;
				2860
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2861	/* Find a line and append it */
				2862	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2863	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2864
				2865	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2866	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2867	if (i < len) {
				2868	if (data[i] == '\r' && i + 1 < len &&
				2869	data[i+1] == '\n')
				2870	i += 2;
				2871	else
				2872	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2873	if (keepends)
				2874	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2875	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2876	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2877	j = i;
				2878	}
				2879	if (j < len) {
				2880	SPLIT_APPEND(data, j, len);
				2881	}
				2882
				2883	Py_DECREF(string);
				2884	return list;
				2885
				2886	onError:
				2887	Py_DECREF(list);
				2888	Py_DECREF(string);
				2889	return NULL;
				2890	}
				2891
				2892	static
				2893	PyObject split_char(PyUnicodeObject self,
				2894	PyObject *list,
				2895	Py_UNICODE ch,
				2896	int maxcount)
				2897	{
				2898	register int i;
				2899	register int j;
				2900	int len = self->length;
				2901	PyObject *str;
				2902
				2903	for (i = j = 0; i < len; ) {
				2904	if (self->str[i] == ch) {
				2905	if (maxcount-- <= 0)
				2906	break;
				2907	SPLIT_APPEND(self->str, j, i);
				2908	i = j = i + 1;
				2909	} else
				2910	i++;
				2911	}
				2912	if (j <= len) {
				2913	SPLIT_APPEND(self->str, j, len);
				2914	}
				2915	return list;
				2916
				2917	onError:
				2918	Py_DECREF(list);
				2919	return NULL;
				2920	}
				2921
				2922	static
				2923	PyObject split_substring(PyUnicodeObject self,
				2924	PyObject *list,
				2925	PyUnicodeObject *substring,
				2926	int maxcount)
				2927	{
				2928	register int i;
				2929	register int j;
				2930	int len = self->length;
				2931	int sublen = substring->length;
				2932	PyObject *str;
				2933
				2934	for (i = j = 0; i < len - sublen; ) {
				2935	if (Py_UNICODE_MATCH(self, i, substring)) {
				2936	if (maxcount-- <= 0)
				2937	break;
				2938	SPLIT_APPEND(self->str, j, i);
				2939	i = j = i + sublen;
				2940	} else
				2941	i++;
				2942	}
				2943	if (j <= len) {
				2944	SPLIT_APPEND(self->str, j, len);
				2945	}
				2946	return list;
				2947
				2948	onError:
				2949	Py_DECREF(list);
				2950	return NULL;
				2951	}
				2952
				2953	#undef SPLIT_APPEND
				2954
				2955	static
				2956	PyObject split(PyUnicodeObject self,
				2957	PyUnicodeObject *substring,
				2958	int maxcount)
				2959	{
				2960	PyObject *list;
				2961
				2962	if (maxcount < 0)
				2963	maxcount = INT_MAX;
				2964
				2965	list = PyList_New(0);
				2966	if (!list)
				2967	return NULL;
				2968
				2969	if (substring == NULL)
				2970	return split_whitespace(self,list,maxcount);
				2971
				2972	else if (substring->length == 1)
				2973	return split_char(self,list,substring->str[0],maxcount);
				2974
				2975	else if (substring->length == 0) {
				2976	Py_DECREF(list);
				2977	PyErr_SetString(PyExc_ValueError, "empty separator");
				2978	return NULL;
				2979	}
				2980	else
				2981	return split_substring(self,list,substring,maxcount);
				2982	}
				2983
				2984	static
				2985	PyObject strip(PyUnicodeObject self,
				2986	int left,
				2987	int right)
				2988	{
				2989	Py_UNICODE *p = self->str;
				2990	int start = 0;
				2991	int end = self->length;
				2992
				2993	if (left)
				2994	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2995	start++;
				2996
				2997	if (right)
				2998	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2999	end--;
				3000
				3001	if (start == 0 && end == self->length) {
				3002	/* couldn't strip anything off, return original string */
				3003	Py_INCREF(self);
				3004	return (PyObject*) self;
				3005	}
				3006
				3007	return (PyObject*) PyUnicode_FromUnicode(
				3008	self->str + start,
				3009	end - start
				3010	);
				3011	}
				3012
				3013	static
				3014	PyObject replace(PyUnicodeObject self,
				3015	PyUnicodeObject *str1,
				3016	PyUnicodeObject *str2,
				3017	int maxcount)
				3018	{
				3019	PyUnicodeObject *u;
				3020
				3021	if (maxcount < 0)
				3022	maxcount = INT_MAX;
				3023
				3024	if (str1->length == 1 && str2->length == 1) {
				3025	int i;
				3026
				3027	/* replace characters */
				3028	if (!findchar(self->str, self->length, str1->str[0])) {
				3029	/* nothing to replace, return original string */
				3030	Py_INCREF(self);
				3031	u = self;
				3032	} else {
				3033	Py_UNICODE u1 = str1->str[0];
				3034	Py_UNICODE u2 = str2->str[0];
				3035
				3036	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3037	self->str,
				3038	self->length
				3039	);
				3040	if (u)
				3041	for (i = 0; i < u->length; i++)
				3042	if (u->str[i] == u1) {
				3043	if (--maxcount < 0)
				3044	break;
				3045	u->str[i] = u2;
				3046	}
				3047	}
				3048
				3049	} else {
				3050	int n, i;
				3051	Py_UNICODE *p;
				3052
				3053	/* replace strings */
				3054	n = count(self, 0, self->length, str1);
				3055	if (n > maxcount)
				3056	n = maxcount;
				3057	if (n == 0) {
				3058	/* nothing to replace, return original string */
				3059	Py_INCREF(self);
				3060	u = self;
				3061	} else {
				3062	u = _PyUnicode_New(
				3063	self->length + n * (str2->length - str1->length));
				3064	if (u) {
				3065	i = 0;
				3066	p = u->str;
				3067	while (i <= self->length - str1->length)
				3068	if (Py_UNICODE_MATCH(self, i, str1)) {
				3069	/* replace string segment */
				3070	Py_UNICODE_COPY(p, str2->str, str2->length);
				3071	p += str2->length;
				3072	i += str1->length;
				3073	if (--n <= 0) {
				3074	/* copy remaining part */
				3075	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3076	break;
				3077	}
				3078	} else
				3079	*p++ = self->str[i++];
				3080	}
				3081	}
				3082	}
				3083
				3084	return (PyObject *) u;
				3085	}
				3086
				3087	/* --- Unicode Object Methods --------------------------------------------- */
				3088
				3089	static char title__doc__[] =
				3090	"S.title() -> unicode\n\
				3091	\n\
				3092	Return a titlecased version of S, i.e. words start with title case\n\
				3093	characters, all remaining cased characters have lower case.";
				3094
				3095	static PyObject*
				3096	unicode_title(PyUnicodeObject self, PyObject args)
				3097	{
				3098	if (!PyArg_NoArgs(args))
				3099	return NULL;
				3100	return fixup(self, fixtitle);
				3101	}
				3102
				3103	static char capitalize__doc__[] =
				3104	"S.capitalize() -> unicode\n\
				3105	\n\
				3106	Return a capitalized version of S, i.e. make the first character\n\
				3107	have upper case.";
				3108
				3109	static PyObject*
				3110	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3111	{
				3112	if (!PyArg_NoArgs(args))
				3113	return NULL;
				3114	return fixup(self, fixcapitalize);
				3115	}
				3116
				#if 0
				/* Disabled: S.capwords() is currently compiled out. */
				static char capwords__doc__[] =
				    "S.capwords() -> unicode\n"
				    "\n"
				    "Apply .capitalize() to all words in S and return the result with\n"
				    "normalized whitespace (all whitespace strings are replaced by ' ').";

				/* Method S.capwords(): split self at whitespace, capitalize every
				   word and join the words again with single blanks.  Returns a new
				   Unicode object or NULL on failure. */
				static PyObject *
				unicode_capwords(PyUnicodeObject *self, PyObject *args)
				{
				    PyObject *list;
				    PyObject *item;
				    int i;

				    if (!PyArg_NoArgs(args))
				        return NULL;

				    /* Split into words */
				    list = split(self, NULL, -1);
				    if (!list)
				        return NULL;

				    /* Capitalize each word */
				    for (i = 0; i < PyList_GET_SIZE(list); i++) {
				        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				                     fixcapitalize);
				        if (item == NULL)
				            goto onError;
				        /* swap the capitalized word in for the original entry */
				        Py_DECREF(PyList_GET_ITEM(list, i));
				        PyList_SET_ITEM(list, i, item);
				    }

				    /* Join the words to form a new string */
				    item = PyUnicode_Join(NULL, list);

				 onError:
				    /* item is NULL here when a word could not be capitalized */
				    Py_DECREF(list);
				    return (PyObject *)item;
				}
				#endif
				3157
				3158	static char center__doc__[] =
				3159	"S.center(width) -> unicode\n\
				3160	\n\
				3161	Return S centered in a Unicode string of length width. Padding is done\n\
				3162	using spaces.";
				3163
				3164	static PyObject *
				3165	unicode_center(PyUnicodeObject self, PyObject args)
				3166	{
				3167	int marg, left;
				3168	int width;
				3169
				3170	if (!PyArg_ParseTuple(args, "i:center", &width))
				3171	return NULL;
				3172
				3173	if (self->length >= width) {
				3174	Py_INCREF(self);
				3175	return (PyObject*) self;
				3176	}
				3177
				3178	marg = width - self->length;
				3179	left = marg / 2 + (marg & width & 1);
				3180
				3181	return (PyObject*) pad(self, left, marg - left, ' ');
				3182	}
				3183
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3184	#if 0
				3185
				3186	/* This code should go into some future Unicode collation support
				3187	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3188	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3189
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3190	/* speedy UTF-16 code point order comparison */
				3191	/* gleaned from: */
				3192	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3193
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3194	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3195	{
				3196	0, 0, 0, 0, 0, 0, 0, 0,
				3197	0, 0, 0, 0, 0, 0, 0, 0,
				3198	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3199	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3200	};
				3201
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3202	static int
				3203	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3204	{
				3205	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3206
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3207	Py_UNICODE *s1 = str1->str;
				3208	Py_UNICODE *s2 = str2->str;
				3209
				3210	len1 = str1->length;
				3211	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3212
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3213	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3214	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3215	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3216
				3217	c1 = *s1++;
				3218	c2 = *s2++;
				3219	if (c1 > (1<<11) * 26)
				3220	c1 += utf16Fixup[c1>>11];
				3221	if (c2 > (1<<11) * 26)
				3222	c2 += utf16Fixup[c2>>11];
				3223
				3224	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3225	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3226	if (diff)
				3227	return (diff < 0) ? -1 : (diff != 0);
				3228	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3229	}
				3230
				3231	return (len1 < len2) ? -1 : (len1 != len2);
				3232	}
				3233
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3234	#else
				3235
				3236	static int
				3237	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3238	{
				3239	register int len1, len2;
				3240
				3241	Py_UNICODE *s1 = str1->str;
				3242	Py_UNICODE *s2 = str2->str;
				3243
				3244	len1 = str1->length;
				3245	len2 = str2->length;
				3246
				3247	while (len1 > 0 && len2 > 0) {
				3248	register long diff;
				3249
				3250	diff = (long)s1++ - (long)s2++;
				3251	if (diff)
				3252	return (diff < 0) ? -1 : (diff != 0);
				3253	len1--; len2--;
				3254	}
				3255
				3256	return (len1 < len2) ? -1 : (len1 != len2);
				3257	}
				3258
				3259	#endif
				3260
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3261	int PyUnicode_Compare(PyObject *left,
				3262	PyObject *right)
				3263	{
				3264	PyUnicodeObject u = NULL, v = NULL;
				3265	int result;
				3266
				3267	/* Coerce the two arguments */
				3268	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3269	if (u == NULL)
				3270	goto onError;
				3271	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3272	if (v == NULL)
				3273	goto onError;
				3274
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3275	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3276	if (v == u) {
				3277	Py_DECREF(u);
				3278	Py_DECREF(v);
				3279	return 0;
				3280	}
				3281
				3282	result = unicode_compare(u, v);
				3283
				3284	Py_DECREF(u);
				3285	Py_DECREF(v);
				3286	return result;
				3287
				3288	onError:
				3289	Py_XDECREF(u);
				3290	Py_XDECREF(v);
				3291	return -1;
				3292	}
				3293
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3294	int PyUnicode_Contains(PyObject *container,
				3295	PyObject *element)
				3296	{
				3297	PyUnicodeObject u = NULL, v = NULL;
				3298	int result;
				3299	register const Py_UNICODE p, e;
				3300	register Py_UNICODE ch;
				3301
				3302	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3303	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3304	if (v == NULL) {
				3305	PyErr_SetString(PyExc_TypeError,
				3306	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3307	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3308	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3309	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3310	if (u == NULL) {
				3311	Py_DECREF(v);
				3312	goto onError;
				3313	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3314
				3315	/* Check v in u */
				3316	if (PyUnicode_GET_SIZE(v) != 1) {
				3317	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3318	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3319	goto onError;
				3320	}
				3321	ch = *PyUnicode_AS_UNICODE(v);
				3322	p = PyUnicode_AS_UNICODE(u);
				3323	e = p + PyUnicode_GET_SIZE(u);
				3324	result = 0;
				3325	while (p < e) {
				3326	if (*p++ == ch) {
				3327	result = 1;
				3328	break;
				3329	}
				3330	}
				3331
				3332	Py_DECREF(u);
				3333	Py_DECREF(v);
				3334	return result;
				3335
				3336	onError:
				3337	Py_XDECREF(u);
				3338	Py_XDECREF(v);
				3339	return -1;
				3340	}
				3341
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3342	/* Concat to string or Unicode object giving a new Unicode object. */
				3343
				3344	PyObject PyUnicode_Concat(PyObject left,
				3345	PyObject *right)
				3346	{
				3347	PyUnicodeObject u = NULL, v = NULL, *w;
				3348
				3349	/* Coerce the two arguments */
				3350	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3351	if (u == NULL)
				3352	goto onError;
				3353	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3354	if (v == NULL)
				3355	goto onError;
				3356
				3357	/* Shortcuts */
				3358	if (v == unicode_empty) {
				3359	Py_DECREF(v);
				3360	return (PyObject *)u;
				3361	}
				3362	if (u == unicode_empty) {
				3363	Py_DECREF(u);
				3364	return (PyObject *)v;
				3365	}
				3366
				3367	/* Concat the two Unicode strings */
				3368	w = _PyUnicode_New(u->length + v->length);
				3369	if (w == NULL)
				3370	goto onError;
				3371	Py_UNICODE_COPY(w->str, u->str, u->length);
				3372	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3373
				3374	Py_DECREF(u);
				3375	Py_DECREF(v);
				3376	return (PyObject *)w;
				3377
				3378	onError:
				3379	Py_XDECREF(u);
				3380	Py_XDECREF(v);
				3381	return NULL;
				3382	}
				3383
				3384	static char count__doc__[] =
				3385	"S.count(sub[, start[, end]]) -> int\n\
				3386	\n\
				3387	Return the number of occurrences of substring sub in Unicode string\n\
				3388	S[start:end]. Optional arguments start and end are\n\
				3389	interpreted as in slice notation.";
				3390
				3391	static PyObject *
				3392	unicode_count(PyUnicodeObject self, PyObject args)
				3393	{
				3394	PyUnicodeObject *substring;
				3395	int start = 0;
				3396	int end = INT_MAX;
				3397	PyObject *result;
				3398
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3399	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3400	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3401	return NULL;
				3402
				3403	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3404	(PyObject *)substring);
				3405	if (substring == NULL)
				3406	return NULL;
				3407
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3408	if (start < 0)
				3409	start += self->length;
				3410	if (start < 0)
				3411	start = 0;
				3412	if (end > self->length)
				3413	end = self->length;
				3414	if (end < 0)
				3415	end += self->length;
				3416	if (end < 0)
				3417	end = 0;
				3418
				3419	result = PyInt_FromLong((long) count(self, start, end, substring));
				3420
				3421	Py_DECREF(substring);
				3422	return result;
				3423	}
				3424
				3425	static char encode__doc__[] =
				3426	"S.encode([encoding[,errors]]) -> string\n\
				3427	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3428	Return an encoded string version of S. Default encoding is the current\n\
				3429	default string encoding. errors may be given to set a different error\n\
				3430	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3431	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3432
				3433	static PyObject *
				3434	unicode_encode(PyUnicodeObject self, PyObject args)
				3435	{
				3436	char *encoding = NULL;
				3437	char *errors = NULL;
				3438	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3439	return NULL;
				3440	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3441	}
				3442
				3443	static char expandtabs__doc__[] =
				3444	"S.expandtabs([tabsize]) -> unicode\n\
				3445	\n\
				3446	Return a copy of S where all tab characters are expanded using spaces.\n\
				3447	If tabsize is not given, a tab size of 8 characters is assumed.";
				3448
				3449	static PyObject*
				3450	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3451	{
				3452	Py_UNICODE *e;
				3453	Py_UNICODE *p;
				3454	Py_UNICODE *q;
				3455	int i, j;
				3456	PyUnicodeObject *u;
				3457	int tabsize = 8;
				3458
				3459	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3460	return NULL;
				3461
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3462	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3463	i = j = 0;
				3464	e = self->str + self->length;
				3465	for (p = self->str; p < e; p++)
				3466	if (*p == '\t') {
				3467	if (tabsize > 0)
				3468	j += tabsize - (j % tabsize);
				3469	}
				3470	else {
				3471	j++;
				3472	if (p == '\n' \|\| p == '\r') {
				3473	i += j;
				3474	j = 0;
				3475	}
				3476	}
				3477
				3478	/* Second pass: create output string and fill it */
				3479	u = _PyUnicode_New(i + j);
				3480	if (!u)
				3481	return NULL;
				3482
				3483	j = 0;
				3484	q = u->str;
				3485
				3486	for (p = self->str; p < e; p++)
				3487	if (*p == '\t') {
				3488	if (tabsize > 0) {
				3489	i = tabsize - (j % tabsize);
				3490	j += i;
				3491	while (i--)
				3492	*q++ = ' ';
				3493	}
				3494	}
				3495	else {
				3496	j++;
				3497	q++ = p;
				3498	if (p == '\n' \|\| p == '\r')
				3499	j = 0;
				3500	}
				3501
				3502	return (PyObject*) u;
				3503	}
				3504
				3505	static char find__doc__[] =
				3506	"S.find(sub [,start [,end]]) -> int\n\
				3507	\n\
				3508	Return the lowest index in S where substring sub is found,\n\
				3509	such that sub is contained within s[start,end]. Optional\n\
				3510	arguments start and end are interpreted as in slice notation.\n\
				3511	\n\
				3512	Return -1 on failure.";
				3513
				3514	static PyObject *
				3515	unicode_find(PyUnicodeObject self, PyObject args)
				3516	{
				3517	PyUnicodeObject *substring;
				3518	int start = 0;
				3519	int end = INT_MAX;
				3520	PyObject *result;
				3521
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3522	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3523	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3524	return NULL;
				3525	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3526	(PyObject *)substring);
				3527	if (substring == NULL)
				3528	return NULL;
				3529
				3530	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3531
				3532	Py_DECREF(substring);
				3533	return result;
				3534	}
				3535
				3536	static PyObject *
				3537	unicode_getitem(PyUnicodeObject *self, int index)
				3538	{
				3539	if (index < 0 \|\| index >= self->length) {
				3540	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3541	return NULL;
				3542	}
				3543
				3544	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3545	}
				3546
				3547	static long
				3548	unicode_hash(PyUnicodeObject *self)
				3549	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3550	/* Since Unicode objects compare equal to their ASCII string
				3551	counterparts, they should use the individual character values
				3552	as basis for their hash value. This is needed to assure that
				3553	strings and Unicode objects behave in the same way as
				3554	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3555
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3556	register int len;
				3557	register Py_UNICODE *p;
				3558	register long x;
				3559
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3560	if (self->hash != -1)
				3561	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3562	len = PyUnicode_GET_SIZE(self);
				3563	p = PyUnicode_AS_UNICODE(self);
				3564	x = *p << 7;
				3565	while (--len >= 0)
				3566	x = (1000003x) ^ p++;
				3567	x ^= PyUnicode_GET_SIZE(self);
				3568	if (x == -1)
				3569	x = -2;
				3570	self->hash = x;
				3571	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3572	}
				3573
				3574	static char index__doc__[] =
				3575	"S.index(sub [,start [,end]]) -> int\n\
				3576	\n\
				3577	Like S.find() but raise ValueError when the substring is not found.";
				3578
				3579	static PyObject *
				3580	unicode_index(PyUnicodeObject self, PyObject args)
				3581	{
				3582	int result;
				3583	PyUnicodeObject *substring;
				3584	int start = 0;
				3585	int end = INT_MAX;
				3586
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3587	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3588	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3589	return NULL;
				3590
				3591	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3592	(PyObject *)substring);
				3593	if (substring == NULL)
				3594	return NULL;
				3595
				3596	result = findstring(self, substring, start, end, 1);
				3597
				3598	Py_DECREF(substring);
				3599	if (result < 0) {
				3600	PyErr_SetString(PyExc_ValueError, "substring not found");
				3601	return NULL;
				3602	}
				3603	return PyInt_FromLong(result);
				3604	}
				3605
				3606	static char islower__doc__[] =
				3607	"S.islower() -> int\n\
				3608	\n\
				3609	Return 1 if all cased characters in S are lowercase and there is\n\
				3610	at least one cased character in S, 0 otherwise.";
				3611
				3612	static PyObject*
				3613	unicode_islower(PyUnicodeObject self, PyObject args)
				3614	{
				3615	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3616	register const Py_UNICODE *e;
				3617	int cased;
				3618
				3619	if (!PyArg_NoArgs(args))
				3620	return NULL;
				3621
				3622	/* Shortcut for single character strings */
				3623	if (PyUnicode_GET_SIZE(self) == 1)
				3624	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3625
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3626	/* Special case for empty strings */
				3627	if (PyString_GET_SIZE(self) == 0)
				3628	return PyInt_FromLong(0);
				3629
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3630	e = p + PyUnicode_GET_SIZE(self);
				3631	cased = 0;
				3632	for (; p < e; p++) {
				3633	register const Py_UNICODE ch = *p;
				3634
				3635	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3636	return PyInt_FromLong(0);
				3637	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3638	cased = 1;
				3639	}
				3640	return PyInt_FromLong(cased);
				3641	}
				3642
				3643	static char isupper__doc__[] =
				3644	"S.isupper() -> int\n\
				3645	\n\
				3646	Return 1 if all cased characters in S are uppercase and there is\n\
				3647	at least one cased character in S, 0 otherwise.";
				3648
				3649	static PyObject*
				3650	unicode_isupper(PyUnicodeObject self, PyObject args)
				3651	{
				3652	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3653	register const Py_UNICODE *e;
				3654	int cased;
				3655
				3656	if (!PyArg_NoArgs(args))
				3657	return NULL;
				3658
				3659	/* Shortcut for single character strings */
				3660	if (PyUnicode_GET_SIZE(self) == 1)
				3661	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3662
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3663	/* Special case for empty strings */
				3664	if (PyString_GET_SIZE(self) == 0)
				3665	return PyInt_FromLong(0);
				3666
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3667	e = p + PyUnicode_GET_SIZE(self);
				3668	cased = 0;
				3669	for (; p < e; p++) {
				3670	register const Py_UNICODE ch = *p;
				3671
				3672	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3673	return PyInt_FromLong(0);
				3674	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3675	cased = 1;
				3676	}
				3677	return PyInt_FromLong(cased);
				3678	}
				3679
				3680	static char istitle__doc__[] =
				3681	"S.istitle() -> int\n\
				3682	\n\
				3683	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3684	may only follow uncased characters and lowercase characters only cased\n\
				3685	ones. Return 0 otherwise.";
				3686
				3687	static PyObject*
				3688	unicode_istitle(PyUnicodeObject self, PyObject args)
				3689	{
				3690	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3691	register const Py_UNICODE *e;
				3692	int cased, previous_is_cased;
				3693
				3694	if (!PyArg_NoArgs(args))
				3695	return NULL;
				3696
				3697	/* Shortcut for single character strings */
				3698	if (PyUnicode_GET_SIZE(self) == 1)
				3699	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3700	(Py_UNICODE_ISUPPER(*p) != 0));
				3701
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3702	/* Special case for empty strings */
				3703	if (PyString_GET_SIZE(self) == 0)
				3704	return PyInt_FromLong(0);
				3705
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3706	e = p + PyUnicode_GET_SIZE(self);
				3707	cased = 0;
				3708	previous_is_cased = 0;
				3709	for (; p < e; p++) {
				3710	register const Py_UNICODE ch = *p;
				3711
				3712	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3713	if (previous_is_cased)
				3714	return PyInt_FromLong(0);
				3715	previous_is_cased = 1;
				3716	cased = 1;
				3717	}
				3718	else if (Py_UNICODE_ISLOWER(ch)) {
				3719	if (!previous_is_cased)
				3720	return PyInt_FromLong(0);
				3721	previous_is_cased = 1;
				3722	cased = 1;
				3723	}
				3724	else
				3725	previous_is_cased = 0;
				3726	}
				3727	return PyInt_FromLong(cased);
				3728	}
				3729
				3730	static char isspace__doc__[] =
				3731	"S.isspace() -> int\n\
				3732	\n\
				3733	Return 1 if there are only whitespace characters in S,\n\
				3734	0 otherwise.";
				3735
				3736	static PyObject*
				3737	unicode_isspace(PyUnicodeObject self, PyObject args)
				3738	{
				3739	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3740	register const Py_UNICODE *e;
				3741
				3742	if (!PyArg_NoArgs(args))
				3743	return NULL;
				3744
				3745	/* Shortcut for single character strings */
				3746	if (PyUnicode_GET_SIZE(self) == 1 &&
				3747	Py_UNICODE_ISSPACE(*p))
				3748	return PyInt_FromLong(1);
				3749
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3750	/* Special case for empty strings */
				3751	if (PyString_GET_SIZE(self) == 0)
				3752	return PyInt_FromLong(0);
				3753
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3754	e = p + PyUnicode_GET_SIZE(self);
				3755	for (; p < e; p++) {
				3756	if (!Py_UNICODE_ISSPACE(*p))
				3757	return PyInt_FromLong(0);
				3758	}
				3759	return PyInt_FromLong(1);
				3760	}
				3761
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3762	static char isalpha__doc__[] =
				3763	"S.isalpha() -> int\n\
				3764	\n\
				3765	Return 1 if all characters in S are alphabetic\n\
				3766	and there is at least one character in S, 0 otherwise.";
				3767
				3768	static PyObject*
				3769	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3770	{
				3771	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3772	register const Py_UNICODE *e;
				3773
				3774	if (!PyArg_NoArgs(args))
				3775	return NULL;
				3776
				3777	/* Shortcut for single character strings */
				3778	if (PyUnicode_GET_SIZE(self) == 1 &&
				3779	Py_UNICODE_ISALPHA(*p))
				3780	return PyInt_FromLong(1);
				3781
				3782	/* Special case for empty strings */
				3783	if (PyString_GET_SIZE(self) == 0)
				3784	return PyInt_FromLong(0);
				3785
				3786	e = p + PyUnicode_GET_SIZE(self);
				3787	for (; p < e; p++) {
				3788	if (!Py_UNICODE_ISALPHA(*p))
				3789	return PyInt_FromLong(0);
				3790	}
				3791	return PyInt_FromLong(1);
				3792	}
				3793
				3794	static char isalnum__doc__[] =
				3795	"S.isalnum() -> int\n\
				3796	\n\
				3797	Return 1 if all characters in S are alphanumeric\n\
				3798	and there is at least one character in S, 0 otherwise.";
				3799
				3800	static PyObject*
				3801	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3802	{
				3803	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3804	register const Py_UNICODE *e;
				3805
				3806	if (!PyArg_NoArgs(args))
				3807	return NULL;
				3808
				3809	/* Shortcut for single character strings */
				3810	if (PyUnicode_GET_SIZE(self) == 1 &&
				3811	Py_UNICODE_ISALNUM(*p))
				3812	return PyInt_FromLong(1);
				3813
				3814	/* Special case for empty strings */
				3815	if (PyString_GET_SIZE(self) == 0)
				3816	return PyInt_FromLong(0);
				3817
				3818	e = p + PyUnicode_GET_SIZE(self);
				3819	for (; p < e; p++) {
				3820	if (!Py_UNICODE_ISALNUM(*p))
				3821	return PyInt_FromLong(0);
				3822	}
				3823	return PyInt_FromLong(1);
				3824	}
				3825
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3826	static char isdecimal__doc__[] =
				3827	"S.isdecimal() -> int\n\
				3828	\n\
				3829	Return 1 if there are only decimal characters in S,\n\
				3830	0 otherwise.";
				3831
				3832	static PyObject*
				3833	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3834	{
				3835	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3836	register const Py_UNICODE *e;
				3837
				3838	if (!PyArg_NoArgs(args))
				3839	return NULL;
				3840
				3841	/* Shortcut for single character strings */
				3842	if (PyUnicode_GET_SIZE(self) == 1 &&
				3843	Py_UNICODE_ISDECIMAL(*p))
				3844	return PyInt_FromLong(1);
				3845
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3846	/* Special case for empty strings */
				3847	if (PyString_GET_SIZE(self) == 0)
				3848	return PyInt_FromLong(0);
				3849
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3850	e = p + PyUnicode_GET_SIZE(self);
				3851	for (; p < e; p++) {
				3852	if (!Py_UNICODE_ISDECIMAL(*p))
				3853	return PyInt_FromLong(0);
				3854	}
				3855	return PyInt_FromLong(1);
				3856	}
				3857
				3858	static char isdigit__doc__[] =
				3859	"S.isdigit() -> int\n\
				3860	\n\
				3861	Return 1 if there are only digit characters in S,\n\
				3862	0 otherwise.";
				3863
				3864	static PyObject*
				3865	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3866	{
				3867	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3868	register const Py_UNICODE *e;
				3869
				3870	if (!PyArg_NoArgs(args))
				3871	return NULL;
				3872
				3873	/* Shortcut for single character strings */
				3874	if (PyUnicode_GET_SIZE(self) == 1 &&
				3875	Py_UNICODE_ISDIGIT(*p))
				3876	return PyInt_FromLong(1);
				3877
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3878	/* Special case for empty strings */
				3879	if (PyString_GET_SIZE(self) == 0)
				3880	return PyInt_FromLong(0);
				3881
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3882	e = p + PyUnicode_GET_SIZE(self);
				3883	for (; p < e; p++) {
				3884	if (!Py_UNICODE_ISDIGIT(*p))
				3885	return PyInt_FromLong(0);
				3886	}
				3887	return PyInt_FromLong(1);
				3888	}
				3889
				3890	static char isnumeric__doc__[] =
				3891	"S.isnumeric() -> int\n\
				3892	\n\
				3893	Return 1 if there are only numeric characters in S,\n\
				3894	0 otherwise.";
				3895
				3896	static PyObject*
				3897	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3898	{
				3899	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3900	register const Py_UNICODE *e;
				3901
				3902	if (!PyArg_NoArgs(args))
				3903	return NULL;
				3904
				3905	/* Shortcut for single character strings */
				3906	if (PyUnicode_GET_SIZE(self) == 1 &&
				3907	Py_UNICODE_ISNUMERIC(*p))
				3908	return PyInt_FromLong(1);
				3909
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3910	/* Special case for empty strings */
				3911	if (PyString_GET_SIZE(self) == 0)
				3912	return PyInt_FromLong(0);
				3913
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3914	e = p + PyUnicode_GET_SIZE(self);
				3915	for (; p < e; p++) {
				3916	if (!Py_UNICODE_ISNUMERIC(*p))
				3917	return PyInt_FromLong(0);
				3918	}
				3919	return PyInt_FromLong(1);
				3920	}
				3921
				3922	static char join__doc__[] =
				3923	"S.join(sequence) -> unicode\n\
				3924	\n\
				3925	Return a string which is the concatenation of the strings in the\n\
				3926	sequence. The separator between elements is S.";
				3927
				3928	static PyObject*
				3929	unicode_join(PyUnicodeObject self, PyObject args)
				3930	{
				3931	PyObject *data;
				3932	if (!PyArg_ParseTuple(args, "O:join", &data))
				3933	return NULL;
				3934
				3935	return PyUnicode_Join((PyObject *)self, data);
				3936	}
				3937
				3938	static int
				3939	unicode_length(PyUnicodeObject *self)
				3940	{
				3941	return self->length;
				3942	}
				3943
				3944	static char ljust__doc__[] =
				3945	"S.ljust(width) -> unicode\n\
				3946	\n\
				3947	Return S left justified in a Unicode string of length width. Padding is\n\
				3948	done using spaces.";
				3949
				3950	static PyObject *
				3951	unicode_ljust(PyUnicodeObject self, PyObject args)
				3952	{
				3953	int width;
				3954	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3955	return NULL;
				3956
				3957	if (self->length >= width) {
				3958	Py_INCREF(self);
				3959	return (PyObject*) self;
				3960	}
				3961
				3962	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3963	}
				3964
				3965	static char lower__doc__[] =
				3966	"S.lower() -> unicode\n\
				3967	\n\
				3968	Return a copy of the string S converted to lowercase.";
				3969
				3970	static PyObject*
				3971	unicode_lower(PyUnicodeObject self, PyObject args)
				3972	{
				3973	if (!PyArg_NoArgs(args))
				3974	return NULL;
				3975	return fixup(self, fixlower);
				3976	}
				3977
				3978	static char lstrip__doc__[] =
				3979	"S.lstrip() -> unicode\n\
				3980	\n\
				3981	Return a copy of the string S with leading whitespace removed.";
				3982
				3983	static PyObject *
				3984	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3985	{
				3986	if (!PyArg_NoArgs(args))
				3987	return NULL;
				3988	return strip(self, 1, 0);
				3989	}
				3990
				3991	static PyObject*
				3992	unicode_repeat(PyUnicodeObject *str, int len)
				3993	{
				3994	PyUnicodeObject *u;
				3995	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3996	int nchars;
				3997	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3998
				3999	if (len < 0)
				4000	len = 0;
				4001
				4002	if (len == 1) {
				4003	/* no repeat, return original string */
				4004	Py_INCREF(str);
				4005	return (PyObject*) str;
				4006	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4007
				4008	/* ensure # of chars needed doesn't overflow int and # of bytes
				4009	* needed doesn't overflow size_t
				4010	*/
				4011	nchars = len * str->length;
				4012	if (len && nchars / len != str->length) {
				4013	PyErr_SetString(PyExc_OverflowError,
				4014	"repeated string is too long");
				4015	return NULL;
				4016	}
				4017	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4018	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4019	PyErr_SetString(PyExc_OverflowError,
				4020	"repeated string is too long");
				4021	return NULL;
				4022	}
				4023	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4024	if (!u)
				4025	return NULL;
				4026
				4027	p = u->str;
				4028
				4029	while (len-- > 0) {
				4030	Py_UNICODE_COPY(p, str->str, str->length);
				4031	p += str->length;
				4032	}
				4033
				4034	return (PyObject*) u;
				4035	}
				4036
				4037	PyObject PyUnicode_Replace(PyObject obj,
				4038	PyObject *subobj,
				4039	PyObject *replobj,
				4040	int maxcount)
				4041	{
				4042	PyObject *self;
				4043	PyObject *str1;
				4044	PyObject *str2;
				4045	PyObject *result;
				4046
				4047	self = PyUnicode_FromObject(obj);
				4048	if (self == NULL)
				4049	return NULL;
				4050	str1 = PyUnicode_FromObject(subobj);
				4051	if (str1 == NULL) {
				4052	Py_DECREF(self);
				4053	return NULL;
				4054	}
				4055	str2 = PyUnicode_FromObject(replobj);
				4056	if (str2 == NULL) {
				4057	Py_DECREF(self);
				4058	Py_DECREF(str1);
				4059	return NULL;
				4060	}
				4061	result = replace((PyUnicodeObject *)self,
				4062	(PyUnicodeObject *)str1,
				4063	(PyUnicodeObject *)str2,
				4064	maxcount);
				4065	Py_DECREF(self);
				4066	Py_DECREF(str1);
				4067	Py_DECREF(str2);
				4068	return result;
				4069	}
				4070
				4071	static char replace__doc__[] =
				4072	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4073	\n\
				4074	Return a copy of S with all occurrences of substring\n\
				4075	old replaced by new. If the optional argument maxsplit is\n\
				4076	given, only the first maxsplit occurrences are replaced.";
				4077
				4078	static PyObject*
				4079	unicode_replace(PyUnicodeObject self, PyObject args)
				4080	{
				4081	PyUnicodeObject *str1;
				4082	PyUnicodeObject *str2;
				4083	int maxcount = -1;
				4084	PyObject *result;
				4085
				4086	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4087	return NULL;
				4088	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4089	if (str1 == NULL)
				4090	return NULL;
				4091	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4092	if (str2 == NULL)
				4093	return NULL;
				4094
				4095	result = replace(self, str1, str2, maxcount);
				4096
				4097	Py_DECREF(str1);
				4098	Py_DECREF(str2);
				4099	return result;
				4100	}
				4101
				4102	static
				4103	PyObject unicode_repr(PyObject unicode)
				4104	{
				4105	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4106	PyUnicode_GET_SIZE(unicode),
				4107	1);
				4108	}
				4109
				4110	static char rfind__doc__[] =
				4111	"S.rfind(sub [,start [,end]]) -> int\n\
				4112	\n\
				4113	Return the highest index in S where substring sub is found,\n\
				4114	such that sub is contained within s[start,end]. Optional\n\
				4115	arguments start and end are interpreted as in slice notation.\n\
				4116	\n\
				4117	Return -1 on failure.";
				4118
				4119	static PyObject *
				4120	unicode_rfind(PyUnicodeObject self, PyObject args)
				4121	{
				4122	PyUnicodeObject *substring;
				4123	int start = 0;
				4124	int end = INT_MAX;
				4125	PyObject *result;
				4126
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4127	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4128	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4129	return NULL;
				4130	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4131	(PyObject *)substring);
				4132	if (substring == NULL)
				4133	return NULL;
				4134
				4135	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4136
				4137	Py_DECREF(substring);
				4138	return result;
				4139	}
				4140
				4141	static char rindex__doc__[] =
				4142	"S.rindex(sub [,start [,end]]) -> int\n\
				4143	\n\
				4144	Like S.rfind() but raise ValueError when the substring is not found.";
				4145
				4146	static PyObject *
				4147	unicode_rindex(PyUnicodeObject self, PyObject args)
				4148	{
				4149	int result;
				4150	PyUnicodeObject *substring;
				4151	int start = 0;
				4152	int end = INT_MAX;
				4153
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4154	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4155	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4156	return NULL;
				4157	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4158	(PyObject *)substring);
				4159	if (substring == NULL)
				4160	return NULL;
				4161
				4162	result = findstring(self, substring, start, end, -1);
				4163
				4164	Py_DECREF(substring);
				4165	if (result < 0) {
				4166	PyErr_SetString(PyExc_ValueError, "substring not found");
				4167	return NULL;
				4168	}
				4169	return PyInt_FromLong(result);
				4170	}
				4171
				4172	static char rjust__doc__[] =
				4173	"S.rjust(width) -> unicode\n\
				4174	\n\
				4175	Return S right justified in a Unicode string of length width. Padding is\n\
				4176	done using spaces.";
				4177
				4178	static PyObject *
				4179	unicode_rjust(PyUnicodeObject self, PyObject args)
				4180	{
				4181	int width;
				4182	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4183	return NULL;
				4184
				4185	if (self->length >= width) {
				4186	Py_INCREF(self);
				4187	return (PyObject*) self;
				4188	}
				4189
				4190	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4191	}
				4192
				4193	static char rstrip__doc__[] =
				4194	"S.rstrip() -> unicode\n\
				4195	\n\
				4196	Return a copy of the string S with trailing whitespace removed.";
				4197
				4198	static PyObject *
				4199	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4200	{
				4201	if (!PyArg_NoArgs(args))
				4202	return NULL;
				4203	return strip(self, 0, 1);
				4204	}
				4205
				4206	static PyObject*
				4207	unicode_slice(PyUnicodeObject *self, int start, int end)
				4208	{
				4209	/* standard clamping */
				4210	if (start < 0)
				4211	start = 0;
				4212	if (end < 0)
				4213	end = 0;
				4214	if (end > self->length)
				4215	end = self->length;
				4216	if (start == 0 && end == self->length) {
				4217	/* full slice, return original string */
				4218	Py_INCREF(self);
				4219	return (PyObject*) self;
				4220	}
				4221	if (start > end)
				4222	start = end;
				4223	/* copy slice */
				4224	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4225	end - start);
				4226	}
				4227
				4228	PyObject PyUnicode_Split(PyObject s,
				4229	PyObject *sep,
				4230	int maxsplit)
				4231	{
				4232	PyObject *result;
				4233
				4234	s = PyUnicode_FromObject(s);
				4235	if (s == NULL)
				4236	return NULL;
				4237	if (sep != NULL) {
				4238	sep = PyUnicode_FromObject(sep);
				4239	if (sep == NULL) {
				4240	Py_DECREF(s);
				4241	return NULL;
				4242	}
				4243	}
				4244
				4245	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4246
				4247	Py_DECREF(s);
				4248	Py_XDECREF(sep);
				4249	return result;
				4250	}
				4251
				4252	static char split__doc__[] =
				4253	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4254	\n\
				4255	Return a list of the words in S, using sep as the\n\
				4256	delimiter string. If maxsplit is given, at most maxsplit\n\
				4257	splits are done. If sep is not specified, any whitespace string\n\
				4258	is a separator.";
				4259
				4260	static PyObject*
				4261	unicode_split(PyUnicodeObject self, PyObject args)
				4262	{
				4263	PyObject *substring = Py_None;
				4264	int maxcount = -1;
				4265
				4266	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4267	return NULL;
				4268
				4269	if (substring == Py_None)
				4270	return split(self, NULL, maxcount);
				4271	else if (PyUnicode_Check(substring))
				4272	return split(self, (PyUnicodeObject *)substring, maxcount);
				4273	else
				4274	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4275	}
				4276
				4277	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4278	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4279	\n\
				4280	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4281	Line breaks are not included in the resulting list unless keepends\n\
				4282	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4283
				4284	static PyObject*
				4285	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4286	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4287	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4288
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4289	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4290	return NULL;
				4291
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4292	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4293	}
				4294
				4295	static
				4296	PyObject unicode_str(PyUnicodeObject self)
				4297	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4298	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4299	}
				4300
				4301	static char strip__doc__[] =
				4302	"S.strip() -> unicode\n\
				4303	\n\
				4304	Return a copy of S with leading and trailing whitespace removed.";
				4305
				4306	static PyObject *
				4307	unicode_strip(PyUnicodeObject self, PyObject args)
				4308	{
				4309	if (!PyArg_NoArgs(args))
				4310	return NULL;
				4311	return strip(self, 1, 1);
				4312	}
				4313
				4314	static char swapcase__doc__[] =
				4315	"S.swapcase() -> unicode\n\
				4316	\n\
				4317	Return a copy of S with uppercase characters converted to lowercase\n\
				4318	and vice versa.";
				4319
				4320	static PyObject*
				4321	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4322	{
				4323	if (!PyArg_NoArgs(args))
				4324	return NULL;
				4325	return fixup(self, fixswapcase);
				4326	}
				4327
				4328	static char translate__doc__[] =
				4329	"S.translate(table) -> unicode\n\
				4330	\n\
				4331	Return a copy of the string S, where all characters have been mapped\n\
				4332	through the given translation table, which must be a mapping of\n\
				4333	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4334	are left untouched. Characters mapped to None are deleted.";
				4335
				4336	static PyObject*
				4337	unicode_translate(PyUnicodeObject self, PyObject args)
				4338	{
				4339	PyObject *table;
				4340
				4341	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4342	return NULL;
				4343	return PyUnicode_TranslateCharmap(self->str,
				4344	self->length,
				4345	table,
				4346	"ignore");
				4347	}
				4348
				4349	static char upper__doc__[] =
				4350	"S.upper() -> unicode\n\
				4351	\n\
				4352	Return a copy of S converted to uppercase.";
				4353
				4354	static PyObject*
				4355	unicode_upper(PyUnicodeObject self, PyObject args)
				4356	{
				4357	if (!PyArg_NoArgs(args))
				4358	return NULL;
				4359	return fixup(self, fixupper);
				4360	}
				4361
				4362	#if 0
				4363	static char zfill__doc__[] =
				4364	"S.zfill(width) -> unicode\n\
				4365	\n\
				4366	Pad a numeric string x with zeros on the left, to fill a field\n\
				4367	of the specified width. The string x is never truncated.";
				4368
				4369	static PyObject *
				4370	unicode_zfill(PyUnicodeObject self, PyObject args)
				4371	{
				4372	int fill;
				4373	PyUnicodeObject *u;
				4374
				4375	int width;
				4376	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4377	return NULL;
				4378
				4379	if (self->length >= width) {
				4380	Py_INCREF(self);
				4381	return (PyObject*) self;
				4382	}
				4383
				4384	fill = width - self->length;
				4385
				4386	u = pad(self, fill, 0, '0');
				4387
				4388	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4389	/* move sign to beginning of string */
				4390	u->str[0] = u->str[fill];
				4391	u->str[fill] = '0';
				4392	}
				4393
				4394	return (PyObject*) u;
				4395	}
				4396	#endif
				4397
				4398	#if 0
				4399	static PyObject*
				4400	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4401	{
				4402	if (!PyArg_NoArgs(args))
				4403	return NULL;
				4404	return PyInt_FromLong(unicode_freelist_size);
				4405	}
				4406	#endif
				4407
				4408	static char startswith__doc__[] =
				4409	"S.startswith(prefix[, start[, end]]) -> int\n\
				4410	\n\
				4411	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4412	optional start, test S beginning at that position. With optional end, stop\n\
				4413	comparing S at that position.";
				4414
				4415	static PyObject *
				4416	unicode_startswith(PyUnicodeObject *self,
				4417	PyObject *args)
				4418	{
				4419	PyUnicodeObject *substring;
				4420	int start = 0;
				4421	int end = INT_MAX;
				4422	PyObject *result;
				4423
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4424	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4425	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4426	return NULL;
				4427	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4428	(PyObject *)substring);
				4429	if (substring == NULL)
				4430	return NULL;
				4431
				4432	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4433
				4434	Py_DECREF(substring);
				4435	return result;
				4436	}
				4437
				4438
				4439	static char endswith__doc__[] =
				4440	"S.endswith(suffix[, start[, end]]) -> int\n\
				4441	\n\
				4442	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4443	optional start, test S beginning at that position. With optional end, stop\n\
				4444	comparing S at that position.";
				4445
				4446	static PyObject *
				4447	unicode_endswith(PyUnicodeObject *self,
				4448	PyObject *args)
				4449	{
				4450	PyUnicodeObject *substring;
				4451	int start = 0;
				4452	int end = INT_MAX;
				4453	PyObject *result;
				4454
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4455	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4456	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4457	return NULL;
				4458	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4459	(PyObject *)substring);
				4460	if (substring == NULL)
				4461	return NULL;
				4462
				4463	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4464
				4465	Py_DECREF(substring);
				4466	return result;
				4467	}
				4468
				4469
				4470	static PyMethodDef unicode_methods[] = {
				4471
				4472	/* Order is according to common usage: often used methods should
				4473	appear first, since lookup is done sequentially. */
				4474
				4475	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4476	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4477	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4478	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4479	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4480	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4481	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4482	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4483	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4484	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4485	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4486	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4487	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4488	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4489	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4490	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4491	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4492	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4493	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4494	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4495	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4496	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4497	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4498	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4499	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4500	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4501	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4502	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4503	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4504	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4505	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4506	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4507	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4508	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4509	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4510	#if 0
				4511	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4512	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4513	#endif
				4514
				4515	#if 0
				4516	/* This one is just used for debugging the implementation. */
				4517	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4518	#endif
				4519
				4520	{NULL, NULL}
				4521	};
				4522
				4523	static PyObject *
				4524	unicode_getattr(PyUnicodeObject self, char name)
				4525	{
				4526	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4527	}
				4528
				4529	static PySequenceMethods unicode_as_sequence = {
				4530	(inquiry) unicode_length, /* sq_length */
				4531	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4532	(intargfunc) unicode_repeat, /* sq_repeat */
				4533	(intargfunc) unicode_getitem, /* sq_item */
				4534	(intintargfunc) unicode_slice, /* sq_slice */
				4535	0, /* sq_ass_item */
				4536	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4537	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4538	};
				4539
				4540	static int
				4541	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4542	int index,
				4543	const void **ptr)
				4544	{
				4545	if (index != 0) {
				4546	PyErr_SetString(PyExc_SystemError,
				4547	"accessing non-existent unicode segment");
				4548	return -1;
				4549	}
				4550	ptr = (void ) self->str;
				4551	return PyUnicode_GET_DATA_SIZE(self);
				4552	}
				4553
				4554	static int
				4555	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4556	const void **ptr)
				4557	{
				4558	PyErr_SetString(PyExc_TypeError,
				4559	"cannot use unicode as modifyable buffer");
				4560	return -1;
				4561	}
				4562
				4563	static int
				4564	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4565	int *lenp)
				4566	{
				4567	if (lenp)
				4568	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4569	return 1;
				4570	}
				4571
				4572	static int
				4573	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4574	int index,
				4575	const void **ptr)
				4576	{
				4577	PyObject *str;
				4578
				4579	if (index != 0) {
				4580	PyErr_SetString(PyExc_SystemError,
				4581	"accessing non-existent unicode segment");
				4582	return -1;
				4583	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4584	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4585	if (str == NULL)
				4586	return -1;
				4587	ptr = (void ) PyString_AS_STRING(str);
				4588	return PyString_GET_SIZE(str);
				4589	}
				4590
				4591	/* Helpers for PyUnicode_Format() */
				4592
				4593	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4594	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4595	{
				4596	int argidx = *p_argidx;
				4597	if (argidx < arglen) {
				4598	(*p_argidx)++;
				4599	if (arglen < 0)
				4600	return args;
				4601	else
				4602	return PyTuple_GetItem(args, argidx);
				4603	}
				4604	PyErr_SetString(PyExc_TypeError,
				4605	"not enough arguments for format string");
				4606	return NULL;
				4607	}
				4608
				4609	#define F_LJUST (1<<0)
				4610	#define F_SIGN (1<<1)
				4611	#define F_BLANK (1<<2)
				4612	#define F_ALT (1<<3)
				4613	#define F_ZERO (1<<4)
				4614
				4615	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4616	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4617	{
				4618	register int i;
				4619	int len;
				4620	va_list va;
				4621	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4622	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4623
				4624	/* First, format the string as char array, then expand to Py_UNICODE
				4625	array. */
				4626	charbuffer = (char *)buffer;
				4627	len = vsprintf(charbuffer, format, va);
				4628	for (i = len - 1; i >= 0; i--)
				4629	buffer[i] = (Py_UNICODE) charbuffer[i];
				4630
				4631	va_end(va);
				4632	return len;
				4633	}
				4634
				4635	static int
				4636	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4637	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4638	int flags,
				4639	int prec,
				4640	int type,
				4641	PyObject *v)
				4642	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4643	/* fmt = '%#.' + `prec` + `type`
				4644	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4645	char fmt[20];
				4646	double x;
				4647
				4648	x = PyFloat_AsDouble(v);
				4649	if (x == -1.0 && PyErr_Occurred())
				4650	return -1;
				4651	if (prec < 0)
				4652	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4653	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4654	type = 'g';
				4655	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4656	/* worst case length calc to ensure no buffer overrun:
				4657	fmt = %#.<prec>g
				4658	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4659	for any double rep.)
				4660	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4661	If prec=0 the effective precision is 1 (the leading digit is
				4662	always given), therefore increase by one to 10+prec. */
				4663	if (buflen <= (size_t)10 + (size_t)prec) {
				4664	PyErr_SetString(PyExc_OverflowError,
				4665	"formatted float is too long (precision too long?)");
				4666	return -1;
				4667	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4668	return usprintf(buf, fmt, x);
				4669	}
				4670
				4671	static int
				4672	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4673	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4674	int flags,
				4675	int prec,
				4676	int type,
				4677	PyObject *v)
				4678	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4679	/* fmt = '%#.' + `prec` + 'l' + `type`
				4680	worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4681	char fmt[20];
				4682	long x;
				4683
				4684	x = PyInt_AsLong(v);
				4685	if (x == -1 && PyErr_Occurred())
				4686	return -1;
				4687	if (prec < 0)
				4688	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4689	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4690	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4691	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4692	PyErr_SetString(PyExc_OverflowError,
				4693	"formatted integer is too long (precision too long?)");
				4694	return -1;
				4695	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4696	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4697	return usprintf(buf, fmt, x);
				4698	}
				4699
				4700	static int
				4701	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4702	size_t buflen,
				4703	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4704	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4705	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4706	if (PyUnicode_Check(v)) {
				4707	if (PyUnicode_GET_SIZE(v) != 1)
				4708	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4709	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4710	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4711
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4712	else if (PyString_Check(v)) {
				4713	if (PyString_GET_SIZE(v) != 1)
				4714	goto onError;
				4715	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4716	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4717
				4718	else {
				4719	/* Integer input truncated to a character */
				4720	long x;
				4721	x = PyInt_AsLong(v);
				4722	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4723	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4724	buf[0] = (char) x;
				4725	}
				4726	buf[1] = '\0';
				4727	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4728
				4729	onError:
				4730	PyErr_SetString(PyExc_TypeError,
				4731	"%c requires int or char");
				4732	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4733	}
				4734
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4735	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4736
				4737	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4738	chars are formatted. XXX This is a magic number. Each formatting
				4739	routine does bounds checking to ensure no overflow, but a better
				4740	solution may be to malloc a buffer of appropriate size for each
				4741	format. For now, the current solution is sufficient.
				4742	*/
				4743	#define FORMATBUFLEN (size_t)120
				4744
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4745	PyObject PyUnicode_Format(PyObject format,
				4746	PyObject *args)
				4747	{
				4748	Py_UNICODE fmt, res;
				4749	int fmtcnt, rescnt, reslen, arglen, argidx;
				4750	int args_owned = 0;
				4751	PyUnicodeObject *result = NULL;
				4752	PyObject *dict = NULL;
				4753	PyObject *uformat;
				4754
				4755	if (format == NULL \|\| args == NULL) {
				4756	PyErr_BadInternalCall();
				4757	return NULL;
				4758	}
				4759	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4760	if (uformat == NULL)
				4761	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4762	fmt = PyUnicode_AS_UNICODE(uformat);
				4763	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4764
				4765	reslen = rescnt = fmtcnt + 100;
				4766	result = _PyUnicode_New(reslen);
				4767	if (result == NULL)
				4768	goto onError;
				4769	res = PyUnicode_AS_UNICODE(result);
				4770
				4771	if (PyTuple_Check(args)) {
				4772	arglen = PyTuple_Size(args);
				4773	argidx = 0;
				4774	}
				4775	else {
				4776	arglen = -1;
				4777	argidx = -2;
				4778	}
				4779	if (args->ob_type->tp_as_mapping)
				4780	dict = args;
				4781
				4782	while (--fmtcnt >= 0) {
				4783	if (*fmt != '%') {
				4784	if (--rescnt < 0) {
				4785	rescnt = fmtcnt + 100;
				4786	reslen += rescnt;
				4787	if (_PyUnicode_Resize(result, reslen) < 0)
				4788	return NULL;
				4789	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4790	--rescnt;
				4791	}
				4792	res++ = fmt++;
				4793	}
				4794	else {
				4795	/* Got a format specifier */
				4796	int flags = 0;
				4797	int width = -1;
				4798	int prec = -1;
				4799	int size = 0;
				4800	Py_UNICODE c = '\0';
				4801	Py_UNICODE fill;
				4802	PyObject *v = NULL;
				4803	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4804	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4805	Py_UNICODE sign;
				4806	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4807	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4808
				4809	fmt++;
				4810	if (*fmt == '(') {
				4811	Py_UNICODE *keystart;
				4812	int keylen;
				4813	PyObject *key;
				4814	int pcount = 1;
				4815
				4816	if (dict == NULL) {
				4817	PyErr_SetString(PyExc_TypeError,
				4818	"format requires a mapping");
				4819	goto onError;
				4820	}
				4821	++fmt;
				4822	--fmtcnt;
				4823	keystart = fmt;
				4824	/* Skip over balanced parentheses */
				4825	while (pcount > 0 && --fmtcnt >= 0) {
				4826	if (*fmt == ')')
				4827	--pcount;
				4828	else if (*fmt == '(')
				4829	++pcount;
				4830	fmt++;
				4831	}
				4832	keylen = fmt - keystart - 1;
				4833	if (fmtcnt < 0 \|\| pcount > 0) {
				4834	PyErr_SetString(PyExc_ValueError,
				4835	"incomplete format key");
				4836	goto onError;
				4837	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4838	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4839	then looked up since Python uses strings to hold
				4840	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4841	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4842	key = PyUnicode_EncodeUTF8(keystart,
				4843	keylen,
				4844	NULL);
				4845	if (key == NULL)
				4846	goto onError;
				4847	if (args_owned) {
				4848	Py_DECREF(args);
				4849	args_owned = 0;
				4850	}
				4851	args = PyObject_GetItem(dict, key);
				4852	Py_DECREF(key);
				4853	if (args == NULL) {
				4854	goto onError;
				4855	}
				4856	args_owned = 1;
				4857	arglen = -1;
				4858	argidx = -2;
				4859	}
				4860	while (--fmtcnt >= 0) {
				4861	switch (c = *fmt++) {
				4862	case '-': flags \|= F_LJUST; continue;
				4863	case '+': flags \|= F_SIGN; continue;
				4864	case ' ': flags \|= F_BLANK; continue;
				4865	case '#': flags \|= F_ALT; continue;
				4866	case '0': flags \|= F_ZERO; continue;
				4867	}
				4868	break;
				4869	}
				4870	if (c == '*') {
				4871	v = getnextarg(args, arglen, &argidx);
				4872	if (v == NULL)
				4873	goto onError;
				4874	if (!PyInt_Check(v)) {
				4875	PyErr_SetString(PyExc_TypeError,
				4876	"* wants int");
				4877	goto onError;
				4878	}
				4879	width = PyInt_AsLong(v);
				4880	if (width < 0) {
				4881	flags \|= F_LJUST;
				4882	width = -width;
				4883	}
				4884	if (--fmtcnt >= 0)
				4885	c = *fmt++;
				4886	}
				4887	else if (c >= '0' && c <= '9') {
				4888	width = c - '0';
				4889	while (--fmtcnt >= 0) {
				4890	c = *fmt++;
				4891	if (c < '0' \|\| c > '9')
				4892	break;
				4893	if ((width*10) / 10 != width) {
				4894	PyErr_SetString(PyExc_ValueError,
				4895	"width too big");
				4896	goto onError;
				4897	}
				4898	width = width*10 + (c - '0');
				4899	}
				4900	}
				4901	if (c == '.') {
				4902	prec = 0;
				4903	if (--fmtcnt >= 0)
				4904	c = *fmt++;
				4905	if (c == '*') {
				4906	v = getnextarg(args, arglen, &argidx);
				4907	if (v == NULL)
				4908	goto onError;
				4909	if (!PyInt_Check(v)) {
				4910	PyErr_SetString(PyExc_TypeError,
				4911	"* wants int");
				4912	goto onError;
				4913	}
				4914	prec = PyInt_AsLong(v);
				4915	if (prec < 0)
				4916	prec = 0;
				4917	if (--fmtcnt >= 0)
				4918	c = *fmt++;
				4919	}
				4920	else if (c >= '0' && c <= '9') {
				4921	prec = c - '0';
				4922	while (--fmtcnt >= 0) {
				4923	c = Py_CHARMASK(*fmt++);
				4924	if (c < '0' \|\| c > '9')
				4925	break;
				4926	if ((prec*10) / 10 != prec) {
				4927	PyErr_SetString(PyExc_ValueError,
				4928	"prec too big");
				4929	goto onError;
				4930	}
				4931	prec = prec*10 + (c - '0');
				4932	}
				4933	}
				4934	} /* prec */
				4935	if (fmtcnt >= 0) {
				4936	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4937	size = c;
				4938	if (--fmtcnt >= 0)
				4939	c = *fmt++;
				4940	}
				4941	}
				4942	if (fmtcnt < 0) {
				4943	PyErr_SetString(PyExc_ValueError,
				4944	"incomplete format");
				4945	goto onError;
				4946	}
				4947	if (c != '%') {
				4948	v = getnextarg(args, arglen, &argidx);
				4949	if (v == NULL)
				4950	goto onError;
				4951	}
				4952	sign = 0;
				4953	fill = ' ';
				4954	switch (c) {
				4955
				4956	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4957	pbuf = formatbuf;
				4958	/* presume that buffer length is at least 1 */
				4959	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4960	len = 1;
				4961	break;
				4962
				4963	case 's':
				4964	case 'r':
				4965	if (PyUnicode_Check(v) && c == 's') {
				4966	temp = v;
				4967	Py_INCREF(temp);
				4968	}
				4969	else {
				4970	PyObject *unicode;
				4971	if (c == 's')
				4972	temp = PyObject_Str(v);
				4973	else
				4974	temp = PyObject_Repr(v);
				4975	if (temp == NULL)
				4976	goto onError;
				4977	if (!PyString_Check(temp)) {
				4978	/* XXX Note: this should never happen, since
				4979	PyObject_Repr() and PyObject_Str() assure
				4980	this */
				4981	Py_DECREF(temp);
				4982	PyErr_SetString(PyExc_TypeError,
				4983	"%s argument has non-string str()");
				4984	goto onError;
				4985	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4986	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4987	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4988	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4989	"strict");
				4990	Py_DECREF(temp);
				4991	temp = unicode;
				4992	if (temp == NULL)
				4993	goto onError;
				4994	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4995	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4996	len = PyUnicode_GET_SIZE(temp);
				4997	if (prec >= 0 && len > prec)
				4998	len = prec;
				4999	break;
				5000
				5001	case 'i':
				5002	case 'd':
				5003	case 'u':
				5004	case 'o':
				5005	case 'x':
				5006	case 'X':
				5007	if (c == 'i')
				5008	c = 'd';
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5009	pbuf = formatbuf;
				5010	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5011	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5012	if (len < 0)
				5013	goto onError;
				5014	sign = (c == 'd');
				5015	if (flags & F_ZERO) {
				5016	fill = '0';
				5017	if ((flags&F_ALT) &&
				5018	(c == 'x' \|\| c == 'X') &&
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5019	pbuf[0] == '0' && pbuf[1] == c) {
				5020	res++ = pbuf++;
				5021	res++ = pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5022	rescnt -= 2;
				5023	len -= 2;
				5024	width -= 2;
				5025	if (width < 0)
				5026	width = 0;
				5027	}
				5028	}
				5029	break;
				5030
				5031	case 'e':
				5032	case 'E':
				5033	case 'f':
				5034	case 'g':
				5035	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5036	pbuf = formatbuf;
				5037	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5038	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5039	if (len < 0)
				5040	goto onError;
				5041	sign = 1;
				5042	if (flags&F_ZERO)
				5043	fill = '0';
				5044	break;
				5045
				5046	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5047	pbuf = formatbuf;
				5048	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5049	if (len < 0)
				5050	goto onError;
				5051	break;
				5052
				5053	default:
				5054	PyErr_Format(PyExc_ValueError,
				5055	"unsupported format character '%c' (0x%x)",
				5056	c, c);
				5057	goto onError;
				5058	}
				5059	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5060	if (pbuf == '-' \|\| pbuf == '+') {
				5061	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5062	len--;
				5063	}
				5064	else if (flags & F_SIGN)
				5065	sign = '+';
				5066	else if (flags & F_BLANK)
				5067	sign = ' ';
				5068	else
				5069	sign = 0;
				5070	}
				5071	if (width < len)
				5072	width = len;
				5073	if (rescnt < width + (sign != 0)) {
				5074	reslen -= rescnt;
				5075	rescnt = width + fmtcnt + 100;
				5076	reslen += rescnt;
				5077	if (_PyUnicode_Resize(result, reslen) < 0)
				5078	return NULL;
				5079	res = PyUnicode_AS_UNICODE(result)
				5080	+ reslen - rescnt;
				5081	}
				5082	if (sign) {
				5083	if (fill != ' ')
				5084	*res++ = sign;
				5085	rescnt--;
				5086	if (width > len)
				5087	width--;
				5088	}
				5089	if (width > len && !(flags & F_LJUST)) {
				5090	do {
				5091	--rescnt;
				5092	*res++ = fill;
				5093	} while (--width > len);
				5094	}
				5095	if (sign && fill == ' ')
				5096	*res++ = sign;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5097	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5098	res += len;
				5099	rescnt -= len;
				5100	while (--width >= len) {
				5101	--rescnt;
				5102	*res++ = ' ';
				5103	}
				5104	if (dict && (argidx < arglen) && c != '%') {
				5105	PyErr_SetString(PyExc_TypeError,
				5106	"not all arguments converted");
				5107	goto onError;
				5108	}
				5109	Py_XDECREF(temp);
				5110	} /* '%' */
				5111	} /* until end */
				5112	if (argidx < arglen && !dict) {
				5113	PyErr_SetString(PyExc_TypeError,
				5114	"not all arguments converted");
				5115	goto onError;
				5116	}
				5117
				5118	if (args_owned) {
				5119	Py_DECREF(args);
				5120	}
				5121	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5122	if (_PyUnicode_Resize(result, reslen - rescnt))
				5123	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5124	return (PyObject *)result;
				5125
				5126	onError:
				5127	Py_XDECREF(result);
				5128	Py_DECREF(uformat);
				5129	if (args_owned) {
				5130	Py_DECREF(args);
				5131	}
				5132	return NULL;
				5133	}
				5134
				5135	static PyBufferProcs unicode_as_buffer = {
				5136	(getreadbufferproc) unicode_buffer_getreadbuf,
				5137	(getwritebufferproc) unicode_buffer_getwritebuf,
				5138	(getsegcountproc) unicode_buffer_getsegcount,
				5139	(getcharbufferproc) unicode_buffer_getcharbuf,
				5140	};
				5141
				5142	PyTypeObject PyUnicode_Type = {
				5143	PyObject_HEAD_INIT(&PyType_Type)
				5144	0, /* ob_size */
				5145	"unicode", /* tp_name */
				5146	sizeof(PyUnicodeObject), /* tp_size */
				5147	0, /* tp_itemsize */
				5148	/* Slots */
				5149	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5150	0, /* tp_print */
				5151	(getattrfunc)unicode_getattr, /* tp_getattr */
				5152	0, /* tp_setattr */
				5153	(cmpfunc) unicode_compare, /* tp_compare */
				5154	(reprfunc) unicode_repr, /* tp_repr */
				5155	0, /* tp_as_number */
				5156	&unicode_as_sequence, /* tp_as_sequence */
				5157	0, /* tp_as_mapping */
				5158	(hashfunc) unicode_hash, /* tp_hash*/
				5159	0, /* tp_call*/
				5160	(reprfunc) unicode_str, /* tp_str */
				5161	(getattrofunc) NULL, /* tp_getattro */
				5162	(setattrofunc) NULL, /* tp_setattro */
				5163	&unicode_as_buffer, /* tp_as_buffer */
				5164	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5165	};
				5166
				5167	/* Initialize the Unicode implementation */
				5168
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5169	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5170	{
				5171	/* Doublecheck the configuration... */
				5172	if (sizeof(Py_UNICODE) != 2)
				5173	Py_FatalError("Unicode configuration error: "
				5174	"sizeof(Py_UNICODE) != 2 bytes");
				5175
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5176	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5177	unicode_freelist = NULL;
				5178	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5179	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5180	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5181	}
				5182
				5183	/* Finalize the Unicode implementation */
				5184
				5185	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5186	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5187	{
				5188	PyUnicodeObject *u = unicode_freelist;
				5189
				5190	while (u != NULL) {
				5191	PyUnicodeObject *v = u;
				5192	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5193	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5194	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5195	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5196	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5197	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5198	unicode_freelist = NULL;
				5199	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5200	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5201	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5202	}