Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 1d35c3d3805ff02d5a1366b10c5303f6cd891cfe [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	111	/* --- Globals ------------------------------------------------------------
				112
				113	The globals are initialized by the _PyUnicode_Init() API and should
				114	not be used before calling that API.
				115
				116	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	117
				118	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	119	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	120
				121	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	122	static PyUnicodeObject *unicode_freelist;
				123	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	124
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	125	/* Default encoding to use and assume when NULL is passed as encoding
				126	parameter; it is initialized by _PyUnicode_Init().
				127
				128	Always use the PyUnicode_SetDefaultEncoding() and
				129	PyUnicode_GetDefaultEncoding() APIs to access this global.
				130
				131	*/
				132
				133	static char unicode_default_encoding[100];
				134
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* --- Unicode Object ----------------------------------------------------- */
				136
				137	static
				138	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				139	int length)
				140	{
				141	void *oldstr;
				142
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	143	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	144	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	146
				147	/* Resizing unicode_empty is not allowed. */
				148	if (unicode == unicode_empty) {
				149	PyErr_SetString(PyExc_SystemError,
				150	"can't resize empty unicode object");
				151	return -1;
				152	}
				153
				154	/* We allocate one more byte to make sure the string is
				155	Ux0000 terminated -- XXX is this needed ? */
				156	oldstr = unicode->str;
				157	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				158	if (!unicode->str) {
				159	unicode->str = oldstr;
				160	PyErr_NoMemory();
				161	return -1;
				162	}
				163	unicode->str[length] = 0;
				164	unicode->length = length;
				165
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	166	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	167	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	168	if (unicode->defenc) {
				169	Py_DECREF(unicode->defenc);
				170	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	171	}
				172	unicode->hash = -1;
				173
				174	return 0;
				175	}
				176
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	177	int PyUnicode_Resize(PyObject **unicode,
				178	int length)
				179	{
				180	PyUnicodeObject *v;
				181
				182	if (unicode == NULL) {
				183	PyErr_BadInternalCall();
				184	return -1;
				185	}
				186	v = (PyUnicodeObject )unicode;
				187	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				188	PyErr_BadInternalCall();
				189	return -1;
				190	}
				191	return _PyUnicode_Resize(v, length);
				192	}
				193
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	194	/* We allocate one more byte to make sure the string is
				195	Ux0000 terminated -- XXX is this needed ?
				196
				197	XXX This allocator could further be enhanced by assuring that the
				198	free list never reduces its size below 1.
				199
				200	*/
				201
				202	static
				203	PyUnicodeObject *_PyUnicode_New(int length)
				204	{
				205	register PyUnicodeObject *unicode;
				206
				207	/* Optimization for empty strings */
				208	if (length == 0 && unicode_empty != NULL) {
				209	Py_INCREF(unicode_empty);
				210	return unicode_empty;
				211	}
				212
				213	/* Unicode freelist & memory allocation */
				214	if (unicode_freelist) {
				215	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	216	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	217	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	219	/* Keep-Alive optimization: we only upsize the buffer,
				220	never downsize it. */
				221	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	223	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	224	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	227	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	229	}
				230	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	231	}
				232	else {
				233	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				234	if (unicode == NULL)
				235	return NULL;
				236	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				237	}
				238
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	239	if (!unicode->str) {
				240	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	241	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	242	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	243	unicode->str[length] = 0;
				244	unicode->length = length;
				245	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	246	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	248
				249	onError:
				250	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	251	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	252	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	253	}
				254
				255	static
				256	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				257	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	258	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	259	/* Keep-Alive optimization */
				260	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	261	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	262	unicode->str = NULL;
				263	unicode->length = 0;
				264	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	265	if (unicode->defenc) {
				266	Py_DECREF(unicode->defenc);
				267	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	268	}
				269	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	270	(PyUnicodeObject *)unicode = unicode_freelist;
				271	unicode_freelist = unicode;
				272	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	273	}
				274	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	275	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	276	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	277	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	278	}
				279	}
				280
				281	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				282	int size)
				283	{
				284	PyUnicodeObject *unicode;
				285
				286	unicode = _PyUnicode_New(size);
				287	if (!unicode)
				288	return NULL;
				289
				290	/* Copy the Unicode data into the new object */
				291	if (u != NULL)
				292	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	#ifdef HAVE_WCHAR_H
				298
				299	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				300	int size)
				301	{
				302	PyUnicodeObject *unicode;
				303
				304	if (w == NULL) {
				305	PyErr_BadInternalCall();
				306	return NULL;
				307	}
				308
				309	unicode = _PyUnicode_New(size);
				310	if (!unicode)
				311	return NULL;
				312
				313	/* Copy the wchar_t data into the new object */
				314	#ifdef HAVE_USABLE_WCHAR_T
				315	memcpy(unicode->str, w, size * sizeof(wchar_t));
				316	#else
				317	{
				318	register Py_UNICODE *u;
				319	register int i;
				320	u = PyUnicode_AS_UNICODE(unicode);
				321	for (i = size; i >= 0; i--)
				322	u++ = w++;
				323	}
				324	#endif
				325
				326	return (PyObject *)unicode;
				327	}
				328
				329	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				330	register wchar_t *w,
				331	int size)
				332	{
				333	if (unicode == NULL) {
				334	PyErr_BadInternalCall();
				335	return -1;
				336	}
				337	if (size > PyUnicode_GET_SIZE(unicode))
				338	size = PyUnicode_GET_SIZE(unicode);
				339	#ifdef HAVE_USABLE_WCHAR_T
				340	memcpy(w, unicode->str, size * sizeof(wchar_t));
				341	#else
				342	{
				343	register Py_UNICODE *u;
				344	register int i;
				345	u = PyUnicode_AS_UNICODE(unicode);
				346	for (i = size; i >= 0; i--)
				347	w++ = u++;
				348	}
				349	#endif
				350
				351	return size;
				352	}
				353
				354	#endif
				355
				356	PyObject PyUnicode_FromObject(register PyObject obj)
				357	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	358	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				359	}
				360
				361	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				362	const char *encoding,
				363	const char *errors)
				364	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	365	const char *s;
				366	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	367	int owned = 0;
				368	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	369
				370	if (obj == NULL) {
				371	PyErr_BadInternalCall();
				372	return NULL;
				373	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	374
				375	/* Coerce object */
				376	if (PyInstance_Check(obj)) {
				377	PyObject *func;
				378	func = PyObject_GetAttrString(obj, "__str__");
				379	if (func == NULL) {
				380	PyErr_SetString(PyExc_TypeError,
				381	"coercing to Unicode: instance doesn't define __str__");
				382	return NULL;
				383	}
				384	obj = PyEval_CallObject(func, NULL);
				385	Py_DECREF(func);
				386	if (obj == NULL)
				387	return NULL;
				388	owned = 1;
				389	}
				390	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	391	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	392	v = obj;
				393	if (encoding) {
				394	PyErr_SetString(PyExc_TypeError,
				395	"decoding Unicode is not supported");
				396	return NULL;
				397	}
				398	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	399	}
				400	else if (PyString_Check(obj)) {
				401	s = PyString_AS_STRING(obj);
				402	len = PyString_GET_SIZE(obj);
				403	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	404	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				405	/* Overwrite the error message with something more useful in
				406	case of a TypeError. */
				407	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	408	PyErr_Format(PyExc_TypeError,
				409	"coercing to Unicode: need string or buffer, "
				410	"%.80s found",
				411	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414
				415	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	if (len == 0) {
				417	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	418	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	419	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	420	else
				421	v = PyUnicode_Decode(s, len, encoding, errors);
				422	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	423	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	424	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426	return v;
				427
				428	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	429	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	430	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	431	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	432	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	433	}
				434
				435	PyObject PyUnicode_Decode(const char s,
				436	int size,
				437	const char *encoding,
				438	const char *errors)
				439	{
				440	PyObject buffer = NULL, unicode;
				441
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	442	if (encoding == NULL)
				443	encoding = PyUnicode_GetDefaultEncoding();
				444
				445	/* Shortcuts for common default encodings */
				446	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	447	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	448	else if (strcmp(encoding, "latin-1") == 0)
				449	return PyUnicode_DecodeLatin1(s, size, errors);
				450	else if (strcmp(encoding, "ascii") == 0)
				451	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452
				453	/* Decode via the codec registry */
				454	buffer = PyBuffer_FromMemory((void *)s, size);
				455	if (buffer == NULL)
				456	goto onError;
				457	unicode = PyCodec_Decode(buffer, encoding, errors);
				458	if (unicode == NULL)
				459	goto onError;
				460	if (!PyUnicode_Check(unicode)) {
				461	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	462	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	463	unicode->ob_type->tp_name);
				464	Py_DECREF(unicode);
				465	goto onError;
				466	}
				467	Py_DECREF(buffer);
				468	return unicode;
				469
				470	onError:
				471	Py_XDECREF(buffer);
				472	return NULL;
				473	}
				474
				475	PyObject PyUnicode_Encode(const Py_UNICODE s,
				476	int size,
				477	const char *encoding,
				478	const char *errors)
				479	{
				480	PyObject v, unicode;
				481
				482	unicode = PyUnicode_FromUnicode(s, size);
				483	if (unicode == NULL)
				484	return NULL;
				485	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				486	Py_DECREF(unicode);
				487	return v;
				488	}
				489
				490	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				491	const char *encoding,
				492	const char *errors)
				493	{
				494	PyObject *v;
				495
				496	if (!PyUnicode_Check(unicode)) {
				497	PyErr_BadArgument();
				498	goto onError;
				499	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	500
				501	if (encoding == NULL)
				502	encoding = PyUnicode_GetDefaultEncoding();
				503
				504	/* Shortcuts for common default encodings */
				505	if (errors == NULL) {
				506	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	507	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	508	else if (strcmp(encoding, "latin-1") == 0)
				509	return PyUnicode_AsLatin1String(unicode);
				510	else if (strcmp(encoding, "ascii") == 0)
				511	return PyUnicode_AsASCIIString(unicode);
				512	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	513
				514	/* Encode via the codec registry */
				515	v = PyCodec_Encode(unicode, encoding, errors);
				516	if (v == NULL)
				517	goto onError;
				518	/* XXX Should we really enforce this ? */
				519	if (!PyString_Check(v)) {
				520	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	521	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	522	v->ob_type->tp_name);
				523	Py_DECREF(v);
				524	goto onError;
				525	}
				526	return v;
				527
				528	onError:
				529	return NULL;
				530	}
				531
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	532	/* Return a Python string holding the default encoded value of the
				533	Unicode object.
				534
				535	The resulting string is cached in the Unicode object for subsequent
				536	usage by this function. The cached version is needed to implement
				537	the character buffer interface and will live (at least) as long as
				538	the Unicode object itself.
				539
				540	The refcount of the string is not incremented.
				541
				542	* Exported for internal use by the interpreter only !!! *
				543
				544	*/
				545
				546	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				547	const char *errors)
				548	{
				549	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				550
				551	if (v)
				552	return v;
				553	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				554	if (v && errors == NULL)
				555	((PyUnicodeObject *)unicode)->defenc = v;
				556	return v;
				557	}
				558
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				560	{
				561	if (!PyUnicode_Check(unicode)) {
				562	PyErr_BadArgument();
				563	goto onError;
				564	}
				565	return PyUnicode_AS_UNICODE(unicode);
				566
				567	onError:
				568	return NULL;
				569	}
				570
				571	int PyUnicode_GetSize(PyObject *unicode)
				572	{
				573	if (!PyUnicode_Check(unicode)) {
				574	PyErr_BadArgument();
				575	goto onError;
				576	}
				577	return PyUnicode_GET_SIZE(unicode);
				578
				579	onError:
				580	return -1;
				581	}
				582
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	583	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	584	{
				585	return unicode_default_encoding;
				586	}
				587
				588	int PyUnicode_SetDefaultEncoding(const char *encoding)
				589	{
				590	PyObject *v;
				591
				592	/* Make sure the encoding is valid. As side effect, this also
				593	loads the encoding into the codec registry cache. */
				594	v = _PyCodec_Lookup(encoding);
				595	if (v == NULL)
				596	goto onError;
				597	Py_DECREF(v);
				598	strncpy(unicode_default_encoding,
				599	encoding,
				600	sizeof(unicode_default_encoding));
				601	return 0;
				602
				603	onError:
				604	return -1;
				605	}
				606
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	607	/* --- UTF-8 Codec -------------------------------------------------------- */
				608
				609	static
				610	char utf8_code_length[256] = {
				611	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				612	illegal prefix. see RFC 2279 for details */
				613	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				614	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				615	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				616	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				617	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				618	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				619	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				620	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				621	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				622	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				623	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				624	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				625	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				626	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				627	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				628	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				629	};
				630
				631	static
				632	int utf8_decoding_error(const char **source,
				633	Py_UNICODE **dest,
				634	const char *errors,
				635	const char *details)
				636	{
				637	if ((errors == NULL) \|\|
				638	(strcmp(errors,"strict") == 0)) {
				639	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	640	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	641	details);
				642	return -1;
				643	}
				644	else if (strcmp(errors,"ignore") == 0) {
				645	(*source)++;
				646	return 0;
				647	}
				648	else if (strcmp(errors,"replace") == 0) {
				649	(*source)++;
				650	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				651	(*dest)++;
				652	return 0;
				653	}
				654	else {
				655	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	656	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	657	errors);
				658	return -1;
				659	}
				660	}
				661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	662	PyObject PyUnicode_DecodeUTF8(const char s,
				663	int size,
				664	const char *errors)
				665	{
				666	int n;
				667	const char *e;
				668	PyUnicodeObject *unicode;
				669	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	670	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	671
				672	/* Note: size will always be longer than the resulting Unicode
				673	character count */
				674	unicode = _PyUnicode_New(size);
				675	if (!unicode)
				676	return NULL;
				677	if (size == 0)
				678	return (PyObject *)unicode;
				679
				680	/* Unpack UTF-8 encoded data */
				681	p = unicode->str;
				682	e = s + size;
				683
				684	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	685	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686
				687	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	688	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	689	s++;
				690	continue;
				691	}
				692
				693	n = utf8_code_length[ch];
				694
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	695	if (s + n > e) {
				696	errmsg = "unexpected end of data";
				697	goto utf8Error;
				698	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699
				700	switch (n) {
				701
				702	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	703	errmsg = "unexpected code byte";
				704	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	705	break;
				706
				707	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	708	errmsg = "internal error";
				709	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	710	break;
				711
				712	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	if ((s[1] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	717	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	718	if (ch < 0x80) {
				719	errmsg = "illegal encoding";
				720	goto utf8Error;
				721	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	722	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	723	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	724	break;
				725
				726	case 3:
				727	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	728	(s[2] & 0xc0) != 0x80) {
				729	errmsg = "invalid data";
				730	goto utf8Error;
				731	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	732	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	733	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				734	errmsg = "illegal encoding";
				735	goto utf8Error;
				736	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	738	*p++ = (Py_UNICODE)ch;
				739	break;
				740
				741	case 4:
				742	if ((s[1] & 0xc0) != 0x80 \|\|
				743	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	(s[3] & 0xc0) != 0x80) {
				745	errmsg = "invalid data";
				746	goto utf8Error;
				747	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	748	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				749	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				750	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	751	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				752	byte encoding */
				753	(ch > 0x10ffff)) { /* maximum value allowed for
				754	UTF-16 */
				755	errmsg = "illegal encoding";
				756	goto utf8Error;
				757	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	758	/* compute and append the two surrogates: */
				759
				760	/* translate from 10000..10FFFF to 0..FFFF */
				761	ch -= 0x10000;
				762
				763	/* high surrogate = top 10 bits added to D800 */
				764	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				765
				766	/* low surrogate = bottom 10 bits added to DC00 */
				767	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	768	break;
				769
				770	default:
				771	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	772	errmsg = "unsupported Unicode code range";
				773	goto utf8Error;
				774	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	775	}
				776	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	777	continue;
				778
				779	utf8Error:
				780	if (utf8_decoding_error(&s, &p, errors, errmsg))
				781	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	782	}
				783
				784	/* Adjust length */
				785	if (_PyUnicode_Resize(unicode, p - unicode->str))
				786	goto onError;
				787
				788	return (PyObject *)unicode;
				789
				790	onError:
				791	Py_DECREF(unicode);
				792	return NULL;
				793	}
				794
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding problem.

   errors == NULL or "strict": set UnicodeError (with `details`), return -1.
   "ignore":                   do nothing, return 0.
   "replace":                  store '?' at **dest, advance *dest, return 0.
   anything else:              set ValueError, return -1.

   `source` is accepted for symmetry with the other error helpers but is
   not read here. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        /* write the substitute and step the output cursor */
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	828
				829	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				830	int size,
				831	const char *errors)
				832	{
				833	PyObject *v;
				834	char *p;
				835	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	836	Py_UCS4 ch2;
				837	unsigned int cbAllocated = 3 * size;
				838	unsigned int cbWritten = 0;
				839	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	842	if (v == NULL)
				843	return NULL;
				844	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	845	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	846
				847	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	while (i < size) {
				849	Py_UCS4 ch = s[i++];
				850	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	851	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	852	cbWritten++;
				853	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	854	else if (ch < 0x0800) {
				855	*p++ = 0xc0 \| (ch >> 6);
				856	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	857	cbWritten += 2;
				858	}
				859	else {
				860	/* Check for high surrogate */
				861	if (0xD800 <= ch && ch <= 0xDBFF) {
				862	if (i != size) {
				863	ch2 = s[i];
				864	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				865
				866	if (cbWritten >= (cbAllocated - 4)) {
				867	/* Provide enough room for some more
				868	surrogates */
				869	cbAllocated += 4*10;
				870	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	871	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	872	}
				873
				874	/* combine the two values */
				875	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				876
				877	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	878	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	879	i++;
				880	cbWritten += 4;
				881	}
				882	}
				883	}
				884	else {
				885	*p++ = (char)(0xe0 \| (ch >> 12));
				886	cbWritten += 3;
				887	}
				888	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				889	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	}
				891	}
				892	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	893	if (_PyString_Resize(&v, p - q))
				894	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	895	return v;
				896
				897	onError:
				898	Py_DECREF(v);
				899	return NULL;
				900	}
				901
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	902	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				903	{
				904	PyObject *str;
				905
				906	if (!PyUnicode_Check(unicode)) {
				907	PyErr_BadArgument();
				908	return NULL;
				909	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	910	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				911	PyUnicode_GET_SIZE(unicode),
				912	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	913	}
				914
				915	/* --- UTF-16 Codec ------------------------------------------------------- */
				916
				917	static
				918	int utf16_decoding_error(const Py_UNICODE **source,
				919	Py_UNICODE **dest,
				920	const char *errors,
				921	const char *details)
				922	{
				923	if ((errors == NULL) \|\|
				924	(strcmp(errors,"strict") == 0)) {
				925	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	926	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	927	details);
				928	return -1;
				929	}
				930	else if (strcmp(errors,"ignore") == 0) {
				931	return 0;
				932	}
				933	else if (strcmp(errors,"replace") == 0) {
				934	if (dest) {
				935	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				936	(*dest)++;
				937	}
				938	return 0;
				939	}
				940	else {
				941	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	942	"UTF-16 decoding error; "
				943	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	944	errors);
				945	return -1;
				946	}
				947	}
				948
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	949	PyObject PyUnicode_DecodeUTF16(const char s,
				950	int size,
				951	const char *errors,
				952	int *byteorder)
				953	{
				954	PyUnicodeObject *unicode;
				955	Py_UNICODE *p;
				956	const Py_UNICODE q, e;
				957	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	958	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	959
				960	/* size should be an even number */
				961	if (size % sizeof(Py_UNICODE) != 0) {
				962	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				963	return NULL;
				964	/* The remaining input chars are ignored if we fall through
				965	here... */
				966	}
				967
				968	/* Note: size will always be longer than the resulting Unicode
				969	character count */
				970	unicode = _PyUnicode_New(size);
				971	if (!unicode)
				972	return NULL;
				973	if (size == 0)
				974	return (PyObject *)unicode;
				975
				976	/* Unpack UTF-16 encoded data */
				977	p = unicode->str;
				978	q = (Py_UNICODE *)s;
				979	e = q + (size / sizeof(Py_UNICODE));
				980
				981	if (byteorder)
				982	bo = *byteorder;
				983
				984	while (q < e) {
				985	register Py_UNICODE ch = *q++;
				986
				987	/* Check for BOM marks (U+FEFF) in the input and adjust
				988	current byte order setting accordingly. Swap input
				989	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				990	!) */
				991	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				992	if (ch == 0xFEFF) {
				993	bo = -1;
				994	continue;
				995	} else if (ch == 0xFFFE) {
				996	bo = 1;
				997	continue;
				998	}
				999	if (bo == 1)
				1000	ch = (ch >> 8) \| (ch << 8);
				1001	#else
				1002	if (ch == 0xFEFF) {
				1003	bo = 1;
				1004	continue;
				1005	} else if (ch == 0xFFFE) {
				1006	bo = -1;
				1007	continue;
				1008	}
				1009	if (bo == -1)
				1010	ch = (ch >> 8) \| (ch << 8);
				1011	#endif
				1012	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1013	*p++ = ch;
				1014	continue;
				1015	}
				1016
				1017	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1018	if (q >= e) {
				1019	errmsg = "unexpected end of data";
				1020	goto utf16Error;
				1021	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1022	if (0xDC00 <= q && q <= 0xDFFF) {
				1023	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1024	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1025	/* This is valid data (a UTF-16 surrogate pair), but
				1026	we are not able to store this information since our
				1027	Py_UNICODE type only has 16 bits... this might
				1028	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1029	errmsg = "code pairs are not supported";
				1030	goto utf16Error;
				1031	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1032	else
				1033	continue;
				1034	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1035	errmsg = "illegal encoding";
				1036	/* Fall through to report the error */
				1037
				1038	utf16Error:
				1039	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1040	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1041	}
				1042
				1043	if (byteorder)
				1044	*byteorder = bo;
				1045
				1046	/* Adjust length */
				1047	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1048	goto onError;
				1049
				1050	return (PyObject *)unicode;
				1051
				1052	onError:
				1053	Py_DECREF(unicode);
				1054	return NULL;
				1055	}
				1056
				1057	#undef UTF16_ERROR
				1058
				1059	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1060	int size,
				1061	const char *errors,
				1062	int byteorder)
				1063	{
				1064	PyObject *v;
				1065	Py_UNICODE *p;
				1066	char *q;
				1067
				1068	/* We don't create UTF-16 pairs... */
				1069	v = PyString_FromStringAndSize(NULL,
				1070	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1071	if (v == NULL)
				1072	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1073
				1074	q = PyString_AS_STRING(v);
				1075	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1076	if (byteorder == 0)
				1077	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1078	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1079	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1080	if (byteorder == 0 \|\|
				1081	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1082	byteorder == -1
				1083	#else
				1084	byteorder == 1
				1085	#endif
				1086	)
				1087	memcpy(p, s, size * sizeof(Py_UNICODE));
				1088	else
				1089	while (size-- > 0) {
				1090	Py_UNICODE ch = *s++;
				1091	*p++ = (ch >> 8) \| (ch << 8);
				1092	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1093	return v;
				1094	}
				1095
				1096	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1097	{
				1098	if (!PyUnicode_Check(unicode)) {
				1099	PyErr_BadArgument();
				1100	return NULL;
				1101	}
				1102	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1103	PyUnicode_GET_SIZE(unicode),
				1104	NULL,
				1105	0);
				1106	}
				1107
				1108	/* --- Unicode Escape Codec ----------------------------------------------- */
				1109
				1110	static
				1111	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1112	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1113	const char *errors,
				1114	const char *details)
				1115	{
				1116	if ((errors == NULL) \|\|
				1117	(strcmp(errors,"strict") == 0)) {
				1118	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1119	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1120	details);
				1121	return -1;
				1122	}
				1123	else if (strcmp(errors,"ignore") == 0) {
				1124	return 0;
				1125	}
				1126	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1127	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1128	return 0;
				1129	}
				1130	else {
				1131	PyErr_Format(PyExc_ValueError,
				1132	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1133	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1134	errors);
				1135	return -1;
				1136	}
				1137	}
				1138
/* Unicode character name lookup API used for \N{...} escapes.  Starts out
   NULL; PyUnicode_DecodeUnicodeEscape() fills it in from the "ucnhashAPI"
   attribute of the "ucnhash" module the first time it meets a \N escape. */
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1139	static _Py_UCNHashAPI *pucnHash = NULL;
				1140
				1141	static
				1142	int mystrnicmp(const char s1, const char s2, size_t count)
				1143	{
				1144	char c1, c2;
				1145
				1146	if (count)
				1147	{
				1148	do
				1149	{
				1150	c1 = tolower(*(s1++));
				1151	c2 = tolower(*(s2++));
				1152	}
				1153	while(--count && c1 == c2);
				1154
				1155	return c1 - c2;
				1156	}
				1157
				1158	return 0;
				1159	}
				1160
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1162	int size,
				1163	const char *errors)
				1164	{
				1165	PyUnicodeObject *v;
				1166	Py_UNICODE p = NULL, buf = NULL;
				1167	const char *end;
				1168
				1169	/* Escaped strings will always be longer than the resulting
				1170	Unicode string, so we start with size here and then reduce the
				1171	length after conversion to the true value. */
				1172	v = _PyUnicode_New(size);
				1173	if (v == NULL)
				1174	goto onError;
				1175	if (size == 0)
				1176	return (PyObject *)v;
				1177	p = buf = PyUnicode_AS_UNICODE(v);
				1178	end = s + size;
				1179	while (s < end) {
				1180	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1181	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1182	int i;
				1183
				1184	/* Non-escape characters are interpreted as Unicode ordinals */
				1185	if (*s != '\\') {
				1186	p++ = (unsigned char)s++;
				1187	continue;
				1188	}
				1189
				1190	/* \ - Escapes */
				1191	s++;
				1192	switch (*s++) {
				1193
				1194	/* \x escapes */
				1195	case '\n': break;
				1196	case '\\': *p++ = '\\'; break;
				1197	case '\'': *p++ = '\''; break;
				1198	case '\"': *p++ = '\"'; break;
				1199	case 'b': *p++ = '\b'; break;
				1200	case 'f': p++ = '\014'; break; / FF */
				1201	case 't': *p++ = '\t'; break;
				1202	case 'n': *p++ = '\n'; break;
				1203	case 'r': *p++ = '\r'; break;
				1204	case 'v': p++ = '\013'; break; / VT */
				1205	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1206
				1207	/* \OOO (octal) escapes */
				1208	case '0': case '1': case '2': case '3':
				1209	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1210	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1211	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1212	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1213	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1214	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1215	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1216	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1217	break;
				1218
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1219	/* \xXXXX escape with 1-n hex digits. for compatibility
				1220	with 8-bit strings, this code ignores all but the last
				1221	two digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1222	case 'x':
				1223	x = 0;
				1224	c = (unsigned char)*s;
				1225	if (isxdigit(c)) {
				1226	do {
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1227	x = (x<<4) & 0xF0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1228	if ('0' <= c && c <= '9')
				1229	x += c - '0';
				1230	else if ('a' <= c && c <= 'f')
				1231	x += 10 + c - 'a';
				1232	else
				1233	x += 10 + c - 'A';
				1234	c = (unsigned char)*++s;
				1235	} while (isxdigit(c));
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1236	*p++ = (unsigned char) x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1237	} else {
				1238	*p++ = '\\';
				1239	*p++ = (unsigned char)s[-1];
				1240	}
				1241	break;
				1242
				1243	/* \uXXXX with 4 hex digits */
				1244	case 'u':
				1245	for (x = 0, i = 0; i < 4; i++) {
				1246	c = (unsigned char)s[i];
				1247	if (!isxdigit(c)) {
				1248	if (unicodeescape_decoding_error(&s, &x, errors,
				1249	"truncated \\uXXXX"))
				1250	goto onError;
				1251	i++;
				1252	break;
				1253	}
				1254	x = (x<<4) & ~0xF;
				1255	if (c >= '0' && c <= '9')
				1256	x += c - '0';
				1257	else if (c >= 'a' && c <= 'f')
				1258	x += 10 + c - 'a';
				1259	else
				1260	x += 10 + c - 'A';
				1261	}
				1262	s += i;
				1263	*p++ = x;
				1264	break;
				1265
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1266	case 'N':
				1267	/* Ok, we need to deal with Unicode Character Names now,
				1268	* make sure we've imported the hash table data...
				1269	*/
				1270	if (pucnHash == NULL)
				1271	{
				1272	PyObject mod = 0, v = 0;
				1273
				1274	mod = PyImport_ImportModule("ucnhash");
				1275	if (mod == NULL)
				1276	goto onError;
				1277	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1278	Py_DECREF(mod);
				1279	if (v == NULL)
				1280	{
				1281	goto onError;
				1282	}
				1283	pucnHash = PyCObject_AsVoidPtr(v);
				1284	Py_DECREF(v);
				1285	if (pucnHash == NULL)
				1286	{
				1287	goto onError;
				1288	}
				1289	}
				1290
				1291	if (*s == '{')
				1292	{
				1293	const char *start = s + 1;
				1294	const char *endBrace = start;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1295	Py_UCS4 value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1296	unsigned long j;
				1297
				1298	/* look for either the closing brace, or we
				1299	* exceed the maximum length of the unicode character names
				1300	*/
				1301	while (*endBrace != '}' &&
				1302	(unsigned int)(endBrace - start) <=
				1303	pucnHash->cchMax &&
				1304	endBrace < end)
				1305	{
				1306	endBrace++;
				1307	}
				1308	if (endBrace != end && *endBrace == '}')
				1309	{
				1310	j = pucnHash->hash(start, endBrace - start);
				1311	if (j > pucnHash->cKeys \|\|
				1312	mystrnicmp(
				1313	start,
				1314	((_Py_UnicodeCharacterName *)
				1315	(pucnHash->getValue(j)))->pszUCN,
				1316	(int)(endBrace - start)) != 0)
				1317	{
				1318	if (unicodeescape_decoding_error(
				1319	&s, &x, errors,
				1320	"Invalid Unicode Character Name"))
				1321	{
				1322	goto onError;
				1323	}
				1324	goto ucnFallthrough;
				1325	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1326	value = ((_Py_UnicodeCharacterName *)
				1327	(pucnHash->getValue(j)))->value;
				1328	if (value < 1<<16)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1329	{
				1330	/* In UCS-2 range, easy solution.. */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1331	*p++ = value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1332	}
				1333	else
				1334	{
				1335	/* Oops, its in UCS-4 space, */
				1336	/* compute and append the two surrogates: */
				1337	/* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1338	value -= 0x10000;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1339
				1340	/* high surrogate = top 10 bits added to D800 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1341	*p++ = 0xD800 + (value >> 10);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1342
				1343	/* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1344	*p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1345	}
				1346	s = endBrace + 1;
				1347	}
				1348	else
				1349	{
				1350	if (unicodeescape_decoding_error(
				1351	&s, &x, errors,
				1352	"Unicode name missing closing brace"))
				1353	goto onError;
				1354	goto ucnFallthrough;
				1355	}
				1356	break;
				1357	}
				1358	if (unicodeescape_decoding_error(
				1359	&s, &x, errors,
				1360	"Missing opening brace for Unicode Character Name escape"))
				1361	goto onError;
				1362	ucnFallthrough:
				1363	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1364	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1365	*p++ = '\\';
				1366	*p++ = (unsigned char)s[-1];
				1367	break;
				1368	}
				1369	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1370	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1371	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1372	return (PyObject *)v;
				1373
				1374	onError:
				1375	Py_XDECREF(v);
				1376	return NULL;
				1377	}
				1378
				1379	/* Return a Unicode-Escape string version of the Unicode object.
				1380
				1381	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1382	appropriate.
				1383
				1384	*/
				1385
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1386	static const Py_UNICODE findchar(const Py_UNICODE s,
				1387	int size,
				1388	Py_UNICODE ch);
				1389
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1390	static
				1391	PyObject unicodeescape_string(const Py_UNICODE s,
				1392	int size,
				1393	int quotes)
				1394	{
				1395	PyObject *repr;
				1396	char *p;
				1397	char *q;
				1398
				1399	static const char *hexdigit = "0123456789ABCDEF";
				1400
				1401	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1402	if (repr == NULL)
				1403	return NULL;
				1404
				1405	p = q = PyString_AS_STRING(repr);
				1406
				1407	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1408	*p++ = 'u';
				1409	*p++ = (findchar(s, size, '\'') &&
				1410	!findchar(s, size, '"')) ? '"' : '\'';
				1411	}
				1412	while (size-- > 0) {
				1413	Py_UNICODE ch = *s++;
				1414	/* Escape quotes */
				1415	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1416	*p++ = '\\';
				1417	*p++ = (char) ch;
				1418	}
				1419	/* Map 16-bit characters to '\uxxxx' */
				1420	else if (ch >= 256) {
				1421	*p++ = '\\';
				1422	*p++ = 'u';
				1423	*p++ = hexdigit[(ch >> 12) & 0xf];
				1424	*p++ = hexdigit[(ch >> 8) & 0xf];
				1425	*p++ = hexdigit[(ch >> 4) & 0xf];
				1426	*p++ = hexdigit[ch & 15];
				1427	}
				1428	/* Map non-printable US ASCII to '\ooo' */
				1429	else if (ch < ' ' \|\| ch >= 128) {
				1430	*p++ = '\\';
				1431	*p++ = hexdigit[(ch >> 6) & 7];
				1432	*p++ = hexdigit[(ch >> 3) & 7];
				1433	*p++ = hexdigit[ch & 7];
				1434	}
				1435	/* Copy everything else as-is */
				1436	else
				1437	*p++ = (char) ch;
				1438	}
				1439	if (quotes)
				1440	*p++ = q[1];
				1441
				1442	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1443	if (_PyString_Resize(&repr, p - q))
				1444	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1445
				1446	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1447
				1448	onError:
				1449	Py_DECREF(repr);
				1450	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1451	}
				1452
				1453	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1454	int size)
				1455	{
				1456	return unicodeescape_string(s, size, 0);
				1457	}
				1458
				1459	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1460	{
				1461	if (!PyUnicode_Check(unicode)) {
				1462	PyErr_BadArgument();
				1463	return NULL;
				1464	}
				1465	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1466	PyUnicode_GET_SIZE(unicode));
				1467	}
				1468
				1469	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1470
				1471	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1472	int size,
				1473	const char *errors)
				1474	{
				1475	PyUnicodeObject *v;
				1476	Py_UNICODE p, buf;
				1477	const char *end;
				1478	const char *bs;
				1479
				1480	/* Escaped strings will always be longer than the resulting
				1481	Unicode string, so we start with size here and then reduce the
				1482	length after conversion to the true value. */
				1483	v = _PyUnicode_New(size);
				1484	if (v == NULL)
				1485	goto onError;
				1486	if (size == 0)
				1487	return (PyObject *)v;
				1488	p = buf = PyUnicode_AS_UNICODE(v);
				1489	end = s + size;
				1490	while (s < end) {
				1491	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1492	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1493	int i;
				1494
				1495	/* Non-escape characters are interpreted as Unicode ordinals */
				1496	if (*s != '\\') {
				1497	p++ = (unsigned char)s++;
				1498	continue;
				1499	}
				1500
				1501	/* \u-escapes are only interpreted iff the number of leading
				1502	backslashes if odd */
				1503	bs = s;
				1504	for (;s < end;) {
				1505	if (*s != '\\')
				1506	break;
				1507	p++ = (unsigned char)s++;
				1508	}
				1509	if (((s - bs) & 1) == 0 \|\|
				1510	s >= end \|\|
				1511	*s != 'u') {
				1512	continue;
				1513	}
				1514	p--;
				1515	s++;
				1516
				1517	/* \uXXXX with 4 hex digits */
				1518	for (x = 0, i = 0; i < 4; i++) {
				1519	c = (unsigned char)s[i];
				1520	if (!isxdigit(c)) {
				1521	if (unicodeescape_decoding_error(&s, &x, errors,
				1522	"truncated \\uXXXX"))
				1523	goto onError;
				1524	i++;
				1525	break;
				1526	}
				1527	x = (x<<4) & ~0xF;
				1528	if (c >= '0' && c <= '9')
				1529	x += c - '0';
				1530	else if (c >= 'a' && c <= 'f')
				1531	x += 10 + c - 'a';
				1532	else
				1533	x += 10 + c - 'A';
				1534	}
				1535	s += i;
				1536	*p++ = x;
				1537	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1538	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1539	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1540	return (PyObject *)v;
				1541
				1542	onError:
				1543	Py_XDECREF(v);
				1544	return NULL;
				1545	}
				1546
				1547	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1548	int size)
				1549	{
				1550	PyObject *repr;
				1551	char *p;
				1552	char *q;
				1553
				1554	static const char *hexdigit = "0123456789ABCDEF";
				1555
				1556	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1557	if (repr == NULL)
				1558	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1559	if (size == 0)
				1560	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1561
				1562	p = q = PyString_AS_STRING(repr);
				1563	while (size-- > 0) {
				1564	Py_UNICODE ch = *s++;
				1565	/* Map 16-bit characters to '\uxxxx' */
				1566	if (ch >= 256) {
				1567	*p++ = '\\';
				1568	*p++ = 'u';
				1569	*p++ = hexdigit[(ch >> 12) & 0xf];
				1570	*p++ = hexdigit[(ch >> 8) & 0xf];
				1571	*p++ = hexdigit[(ch >> 4) & 0xf];
				1572	*p++ = hexdigit[ch & 15];
				1573	}
				1574	/* Copy everything else as-is */
				1575	else
				1576	*p++ = (char) ch;
				1577	}
				1578	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1579	if (_PyString_Resize(&repr, p - q))
				1580	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1581
				1582	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1583
				1584	onError:
				1585	Py_DECREF(repr);
				1586	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1587	}
				1588
				1589	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1590	{
				1591	if (!PyUnicode_Check(unicode)) {
				1592	PyErr_BadArgument();
				1593	return NULL;
				1594	}
				1595	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1596	PyUnicode_GET_SIZE(unicode));
				1597	}
				1598
				1599	/* --- Latin-1 Codec ------------------------------------------------------ */
				1600
				1601	PyObject PyUnicode_DecodeLatin1(const char s,
				1602	int size,
				1603	const char *errors)
				1604	{
				1605	PyUnicodeObject *v;
				1606	Py_UNICODE *p;
				1607
				1608	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1609	v = _PyUnicode_New(size);
				1610	if (v == NULL)
				1611	goto onError;
				1612	if (size == 0)
				1613	return (PyObject *)v;
				1614	p = PyUnicode_AS_UNICODE(v);
				1615	while (size-- > 0)
				1616	p++ = (unsigned char)s++;
				1617	return (PyObject *)v;
				1618
				1619	onError:
				1620	Py_XDECREF(v);
				1621	return NULL;
				1622	}
				1623
				1624	static
				1625	int latin1_encoding_error(const Py_UNICODE **source,
				1626	char **dest,
				1627	const char *errors,
				1628	const char *details)
				1629	{
				1630	if ((errors == NULL) \|\|
				1631	(strcmp(errors,"strict") == 0)) {
				1632	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1633	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1634	details);
				1635	return -1;
				1636	}
				1637	else if (strcmp(errors,"ignore") == 0) {
				1638	return 0;
				1639	}
				1640	else if (strcmp(errors,"replace") == 0) {
				1641	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1642	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1643	return 0;
				1644	}
				1645	else {
				1646	PyErr_Format(PyExc_ValueError,
				1647	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1648	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1649	errors);
				1650	return -1;
				1651	}
				1652	}
				1653
				1654	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1655	int size,
				1656	const char *errors)
				1657	{
				1658	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1659	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1660
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1661	repr = PyString_FromStringAndSize(NULL, size);
				1662	if (repr == NULL)
				1663	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1664	if (size == 0)
				1665	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1666
				1667	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1668	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1669	while (size-- > 0) {
				1670	Py_UNICODE ch = *p++;
				1671	if (ch >= 256) {
				1672	if (latin1_encoding_error(&p, &s, errors,
				1673	"ordinal not in range(256)"))
				1674	goto onError;
				1675	}
				1676	else
				1677	*s++ = (char)ch;
				1678	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1679	/* Resize if error handling skipped some characters */
				1680	if (s - start < PyString_GET_SIZE(repr))
				1681	if (_PyString_Resize(&repr, s - start))
				1682	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1683	return repr;
				1684
				1685	onError:
				1686	Py_DECREF(repr);
				1687	return NULL;
				1688	}
				1689
				1690	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1691	{
				1692	if (!PyUnicode_Check(unicode)) {
				1693	PyErr_BadArgument();
				1694	return NULL;
				1695	}
				1696	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1697	PyUnicode_GET_SIZE(unicode),
				1698	NULL);
				1699	}
				1700
				1701	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1702
				1703	static
				1704	int ascii_decoding_error(const char **source,
				1705	Py_UNICODE **dest,
				1706	const char *errors,
				1707	const char *details)
				1708	{
				1709	if ((errors == NULL) \|\|
				1710	(strcmp(errors,"strict") == 0)) {
				1711	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1712	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1713	details);
				1714	return -1;
				1715	}
				1716	else if (strcmp(errors,"ignore") == 0) {
				1717	return 0;
				1718	}
				1719	else if (strcmp(errors,"replace") == 0) {
				1720	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1721	(*dest)++;
				1722	return 0;
				1723	}
				1724	else {
				1725	PyErr_Format(PyExc_ValueError,
				1726	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1727	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1728	errors);
				1729	return -1;
				1730	}
				1731	}
				1732
				1733	PyObject PyUnicode_DecodeASCII(const char s,
				1734	int size,
				1735	const char *errors)
				1736	{
				1737	PyUnicodeObject *v;
				1738	Py_UNICODE *p;
				1739
				1740	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1741	v = _PyUnicode_New(size);
				1742	if (v == NULL)
				1743	goto onError;
				1744	if (size == 0)
				1745	return (PyObject *)v;
				1746	p = PyUnicode_AS_UNICODE(v);
				1747	while (size-- > 0) {
				1748	register unsigned char c;
				1749
				1750	c = (unsigned char)*s++;
				1751	if (c < 128)
				1752	*p++ = c;
				1753	else if (ascii_decoding_error(&s, &p, errors,
				1754	"ordinal not in range(128)"))
				1755	goto onError;
				1756	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1757	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1758	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1759	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1760	return (PyObject *)v;
				1761
				1762	onError:
				1763	Py_XDECREF(v);
				1764	return NULL;
				1765	}
				1766
				1767	static
				1768	int ascii_encoding_error(const Py_UNICODE **source,
				1769	char **dest,
				1770	const char *errors,
				1771	const char *details)
				1772	{
				1773	if ((errors == NULL) \|\|
				1774	(strcmp(errors,"strict") == 0)) {
				1775	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1776	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1777	details);
				1778	return -1;
				1779	}
				1780	else if (strcmp(errors,"ignore") == 0) {
				1781	return 0;
				1782	}
				1783	else if (strcmp(errors,"replace") == 0) {
				1784	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1785	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1786	return 0;
				1787	}
				1788	else {
				1789	PyErr_Format(PyExc_ValueError,
				1790	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1791	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1792	errors);
				1793	return -1;
				1794	}
				1795	}
				1796
				1797	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1798	int size,
				1799	const char *errors)
				1800	{
				1801	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1802	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1803
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1804	repr = PyString_FromStringAndSize(NULL, size);
				1805	if (repr == NULL)
				1806	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1807	if (size == 0)
				1808	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1809
				1810	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1811	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1812	while (size-- > 0) {
				1813	Py_UNICODE ch = *p++;
				1814	if (ch >= 128) {
				1815	if (ascii_encoding_error(&p, &s, errors,
				1816	"ordinal not in range(128)"))
				1817	goto onError;
				1818	}
				1819	else
				1820	*s++ = (char)ch;
				1821	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1822	/* Resize if error handling skipped some characters */
				1823	if (s - start < PyString_GET_SIZE(repr))
				1824	if (_PyString_Resize(&repr, s - start))
				1825	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1826	return repr;
				1827
				1828	onError:
				1829	Py_DECREF(repr);
				1830	return NULL;
				1831	}
				1832
				1833	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1834	{
				1835	if (!PyUnicode_Check(unicode)) {
				1836	PyErr_BadArgument();
				1837	return NULL;
				1838	}
				1839	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1840	PyUnicode_GET_SIZE(unicode),
				1841	NULL);
				1842	}
				1843
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1844	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1845
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1846	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1847
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1848	PyObject PyUnicode_DecodeMBCS(const char s,
				1849	int size,
				1850	const char *errors)
				1851	{
				1852	PyUnicodeObject *v;
				1853	Py_UNICODE *p;
				1854
				1855	/* First get the size of the result */
				1856	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1857	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1858	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1859
				1860	v = _PyUnicode_New(usize);
				1861	if (v == NULL)
				1862	return NULL;
				1863	if (usize == 0)
				1864	return (PyObject *)v;
				1865	p = PyUnicode_AS_UNICODE(v);
				1866	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1867	Py_DECREF(v);
				1868	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1869	}
				1870
				1871	return (PyObject *)v;
				1872	}
				1873
				1874	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1875	int size,
				1876	const char *errors)
				1877	{
				1878	PyObject *repr;
				1879	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1880	DWORD mbcssize;
				1881
				1882	/* If there are no characters, bail now! */
				1883	if (size==0)
				1884	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1885
				1886	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1887	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1888	if (mbcssize==0)
				1889	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1890
				1891	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1892	if (repr == NULL)
				1893	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1894	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1895	return repr;
				1896
				1897	/* Do the conversion */
				1898	s = PyString_AS_STRING(repr);
				1899	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1900	Py_DECREF(repr);
				1901	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1902	}
				1903	return repr;
				1904	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1905
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1906	#endif /* MS_WIN32 */
				1907
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1908	/* --- Character Mapping Codec -------------------------------------------- */
				1909
				1910	static
				1911	int charmap_decoding_error(const char **source,
				1912	Py_UNICODE **dest,
				1913	const char *errors,
				1914	const char *details)
				1915	{
				1916	if ((errors == NULL) \|\|
				1917	(strcmp(errors,"strict") == 0)) {
				1918	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1919	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1920	details);
				1921	return -1;
				1922	}
				1923	else if (strcmp(errors,"ignore") == 0) {
				1924	return 0;
				1925	}
				1926	else if (strcmp(errors,"replace") == 0) {
				1927	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1928	(*dest)++;
				1929	return 0;
				1930	}
				1931	else {
				1932	PyErr_Format(PyExc_ValueError,
				1933	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1934	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1935	errors);
				1936	return -1;
				1937	}
				1938	}
				1939
				1940	PyObject PyUnicode_DecodeCharmap(const char s,
				1941	int size,
				1942	PyObject *mapping,
				1943	const char *errors)
				1944	{
				1945	PyUnicodeObject *v;
				1946	Py_UNICODE *p;
				1947
				1948	/* Default to Latin-1 */
				1949	if (mapping == NULL)
				1950	return PyUnicode_DecodeLatin1(s, size, errors);
				1951
				1952	v = _PyUnicode_New(size);
				1953	if (v == NULL)
				1954	goto onError;
				1955	if (size == 0)
				1956	return (PyObject *)v;
				1957	p = PyUnicode_AS_UNICODE(v);
				1958	while (size-- > 0) {
				1959	unsigned char ch = *s++;
				1960	PyObject w, x;
				1961
				1962	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1963	w = PyInt_FromLong((long)ch);
				1964	if (w == NULL)
				1965	goto onError;
				1966	x = PyObject_GetItem(mapping, w);
				1967	Py_DECREF(w);
				1968	if (x == NULL) {
				1969	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1970	/* No mapping found: default to Latin-1 mapping */
				1971	PyErr_Clear();
				1972	*p++ = (Py_UNICODE)ch;
				1973	continue;
				1974	}
				1975	goto onError;
				1976	}
				1977
				1978	/* Apply mapping */
				1979	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1980	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1981	if (value < 0 \|\| value > 65535) {
				1982	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1983	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1984	Py_DECREF(x);
				1985	goto onError;
				1986	}
				1987	*p++ = (Py_UNICODE)value;
				1988	}
				1989	else if (x == Py_None) {
				1990	/* undefined mapping */
				1991	if (charmap_decoding_error(&s, &p, errors,
				1992	"character maps to <undefined>")) {
				1993	Py_DECREF(x);
				1994	goto onError;
				1995	}
				1996	}
				1997	else if (PyUnicode_Check(x)) {
				1998	if (PyUnicode_GET_SIZE(x) != 1) {
				1999	/* 1-n mapping */
				2000	PyErr_SetString(PyExc_NotImplementedError,
				2001	"1-n mappings are currently not implemented");
				2002	Py_DECREF(x);
				2003	goto onError;
				2004	}
				2005	p++ = PyUnicode_AS_UNICODE(x);
				2006	}
				2007	else {
				2008	/* wrong return value */
				2009	PyErr_SetString(PyExc_TypeError,
				2010	"character mapping must return integer, None or unicode");
				2011	Py_DECREF(x);
				2012	goto onError;
				2013	}
				2014	Py_DECREF(x);
				2015	}
				2016	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2017	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2018	goto onError;
				2019	return (PyObject *)v;
				2020
				2021	onError:
				2022	Py_XDECREF(v);
				2023	return NULL;
				2024	}
				2025
				2026	static
				2027	int charmap_encoding_error(const Py_UNICODE **source,
				2028	char **dest,
				2029	const char *errors,
				2030	const char *details)
				2031	{
				2032	if ((errors == NULL) \|\|
				2033	(strcmp(errors,"strict") == 0)) {
				2034	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2035	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2036	details);
				2037	return -1;
				2038	}
				2039	else if (strcmp(errors,"ignore") == 0) {
				2040	return 0;
				2041	}
				2042	else if (strcmp(errors,"replace") == 0) {
				2043	**dest = '?';
				2044	(*dest)++;
				2045	return 0;
				2046	}
				2047	else {
				2048	PyErr_Format(PyExc_ValueError,
				2049	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2050	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2051	errors);
				2052	return -1;
				2053	}
				2054	}
				2055
				2056	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2057	int size,
				2058	PyObject *mapping,
				2059	const char *errors)
				2060	{
				2061	PyObject *v;
				2062	char *s;
				2063
				2064	/* Default to Latin-1 */
				2065	if (mapping == NULL)
				2066	return PyUnicode_EncodeLatin1(p, size, errors);
				2067
				2068	v = PyString_FromStringAndSize(NULL, size);
				2069	if (v == NULL)
				2070	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2071	if (size == 0)
				2072	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2073	s = PyString_AS_STRING(v);
				2074	while (size-- > 0) {
				2075	Py_UNICODE ch = *p++;
				2076	PyObject w, x;
				2077
				2078	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2079	w = PyInt_FromLong((long)ch);
				2080	if (w == NULL)
				2081	goto onError;
				2082	x = PyObject_GetItem(mapping, w);
				2083	Py_DECREF(w);
				2084	if (x == NULL) {
				2085	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2086	/* No mapping found: default to Latin-1 mapping if possible */
				2087	PyErr_Clear();
				2088	if (ch < 256) {
				2089	*s++ = (char)ch;
				2090	continue;
				2091	}
				2092	else if (!charmap_encoding_error(&p, &s, errors,
				2093	"missing character mapping"))
				2094	continue;
				2095	}
				2096	goto onError;
				2097	}
				2098
				2099	/* Apply mapping */
				2100	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2101	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2102	if (value < 0 \|\| value > 255) {
				2103	PyErr_SetString(PyExc_TypeError,
				2104	"character mapping must be in range(256)");
				2105	Py_DECREF(x);
				2106	goto onError;
				2107	}
				2108	*s++ = (char)value;
				2109	}
				2110	else if (x == Py_None) {
				2111	/* undefined mapping */
				2112	if (charmap_encoding_error(&p, &s, errors,
				2113	"character maps to <undefined>")) {
				2114	Py_DECREF(x);
				2115	goto onError;
				2116	}
				2117	}
				2118	else if (PyString_Check(x)) {
				2119	if (PyString_GET_SIZE(x) != 1) {
				2120	/* 1-n mapping */
				2121	PyErr_SetString(PyExc_NotImplementedError,
				2122	"1-n mappings are currently not implemented");
				2123	Py_DECREF(x);
				2124	goto onError;
				2125	}
				2126	s++ = PyString_AS_STRING(x);
				2127	}
				2128	else {
				2129	/* wrong return value */
				2130	PyErr_SetString(PyExc_TypeError,
				2131	"character mapping must return integer, None or unicode");
				2132	Py_DECREF(x);
				2133	goto onError;
				2134	}
				2135	Py_DECREF(x);
				2136	}
				2137	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2138	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2139	goto onError;
				2140	return v;
				2141
				2142	onError:
				2143	Py_DECREF(v);
				2144	return NULL;
				2145	}
				2146
				2147	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2148	PyObject *mapping)
				2149	{
				2150	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2151	PyErr_BadArgument();
				2152	return NULL;
				2153	}
				2154	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2155	PyUnicode_GET_SIZE(unicode),
				2156	mapping,
				2157	NULL);
				2158	}
				2159
				2160	static
				2161	int translate_error(const Py_UNICODE **source,
				2162	Py_UNICODE **dest,
				2163	const char *errors,
				2164	const char *details)
				2165	{
				2166	if ((errors == NULL) \|\|
				2167	(strcmp(errors,"strict") == 0)) {
				2168	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2169	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2170	details);
				2171	return -1;
				2172	}
				2173	else if (strcmp(errors,"ignore") == 0) {
				2174	return 0;
				2175	}
				2176	else if (strcmp(errors,"replace") == 0) {
				2177	**dest = '?';
				2178	(*dest)++;
				2179	return 0;
				2180	}
				2181	else {
				2182	PyErr_Format(PyExc_ValueError,
				2183	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2184	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2185	errors);
				2186	return -1;
				2187	}
				2188	}
				2189
				2190	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2191	int size,
				2192	PyObject *mapping,
				2193	const char *errors)
				2194	{
				2195	PyUnicodeObject *v;
				2196	Py_UNICODE *p;
				2197
				2198	if (mapping == NULL) {
				2199	PyErr_BadArgument();
				2200	return NULL;
				2201	}
				2202
				2203	/* Output will never be longer than input */
				2204	v = _PyUnicode_New(size);
				2205	if (v == NULL)
				2206	goto onError;
				2207	if (size == 0)
				2208	goto done;
				2209	p = PyUnicode_AS_UNICODE(v);
				2210	while (size-- > 0) {
				2211	Py_UNICODE ch = *s++;
				2212	PyObject w, x;
				2213
				2214	/* Get mapping */
				2215	w = PyInt_FromLong(ch);
				2216	if (w == NULL)
				2217	goto onError;
				2218	x = PyObject_GetItem(mapping, w);
				2219	Py_DECREF(w);
				2220	if (x == NULL) {
				2221	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2222	/* No mapping found: default to 1-1 mapping */
				2223	PyErr_Clear();
				2224	*p++ = ch;
				2225	continue;
				2226	}
				2227	goto onError;
				2228	}
				2229
				2230	/* Apply mapping */
				2231	if (PyInt_Check(x))
				2232	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2233	else if (x == Py_None) {
				2234	/* undefined mapping */
				2235	if (translate_error(&s, &p, errors,
				2236	"character maps to <undefined>")) {
				2237	Py_DECREF(x);
				2238	goto onError;
				2239	}
				2240	}
				2241	else if (PyUnicode_Check(x)) {
				2242	if (PyUnicode_GET_SIZE(x) != 1) {
				2243	/* 1-n mapping */
				2244	PyErr_SetString(PyExc_NotImplementedError,
				2245	"1-n mappings are currently not implemented");
				2246	Py_DECREF(x);
				2247	goto onError;
				2248	}
				2249	p++ = PyUnicode_AS_UNICODE(x);
				2250	}
				2251	else {
				2252	/* wrong return value */
				2253	PyErr_SetString(PyExc_TypeError,
				2254	"translate mapping must return integer, None or unicode");
				2255	Py_DECREF(x);
				2256	goto onError;
				2257	}
				2258	Py_DECREF(x);
				2259	}
				2260	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2261	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2262	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2263
				2264	done:
				2265	return (PyObject *)v;
				2266
				2267	onError:
				2268	Py_XDECREF(v);
				2269	return NULL;
				2270	}
				2271
				2272	PyObject PyUnicode_Translate(PyObject str,
				2273	PyObject *mapping,
				2274	const char *errors)
				2275	{
				2276	PyObject *result;
				2277
				2278	str = PyUnicode_FromObject(str);
				2279	if (str == NULL)
				2280	goto onError;
				2281	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2282	PyUnicode_GET_SIZE(str),
				2283	mapping,
				2284	errors);
				2285	Py_DECREF(str);
				2286	return result;
				2287
				2288	onError:
				2289	Py_XDECREF(str);
				2290	return NULL;
				2291	}
				2292
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2293	/* --- Decimal Encoder ---------------------------------------------------- */
				2294
				2295	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2296	int length,
				2297	char *output,
				2298	const char *errors)
				2299	{
				2300	Py_UNICODE p, end;
				2301
				2302	if (output == NULL) {
				2303	PyErr_BadArgument();
				2304	return -1;
				2305	}
				2306
				2307	p = s;
				2308	end = s + length;
				2309	while (p < end) {
				2310	register Py_UNICODE ch = *p++;
				2311	int decimal;
				2312
				2313	if (Py_UNICODE_ISSPACE(ch)) {
				2314	*output++ = ' ';
				2315	continue;
				2316	}
				2317	decimal = Py_UNICODE_TODECIMAL(ch);
				2318	if (decimal >= 0) {
				2319	*output++ = '0' + decimal;
				2320	continue;
				2321	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2322	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2323	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2324	continue;
				2325	}
				2326	/* All other characters are considered invalid */
				2327	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2328	PyErr_SetString(PyExc_ValueError,
				2329	"invalid decimal Unicode string");
				2330	goto onError;
				2331	}
				2332	else if (strcmp(errors, "ignore") == 0)
				2333	continue;
				2334	else if (strcmp(errors, "replace") == 0) {
				2335	*output++ = '?';
				2336	continue;
				2337	}
				2338	}
				2339	/* 0-terminate the output string */
				2340	*output++ = '\0';
				2341	return 0;
				2342
				2343	onError:
				2344	return -1;
				2345	}
				2346
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2347	/* --- Helpers ------------------------------------------------------------ */
				2348
				2349	static
				2350	int count(PyUnicodeObject *self,
				2351	int start,
				2352	int end,
				2353	PyUnicodeObject *substring)
				2354	{
				2355	int count = 0;
				2356
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2357	if (substring->length == 0)
				2358	return (end - start + 1);
				2359
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2360	end -= substring->length;
				2361
				2362	while (start <= end)
				2363	if (Py_UNICODE_MATCH(self, start, substring)) {
				2364	count++;
				2365	start += substring->length;
				2366	} else
				2367	start++;
				2368
				2369	return count;
				2370	}
				2371
				2372	int PyUnicode_Count(PyObject *str,
				2373	PyObject *substr,
				2374	int start,
				2375	int end)
				2376	{
				2377	int result;
				2378
				2379	str = PyUnicode_FromObject(str);
				2380	if (str == NULL)
				2381	return -1;
				2382	substr = PyUnicode_FromObject(substr);
				2383	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2384	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2385	return -1;
				2386	}
				2387
				2388	result = count((PyUnicodeObject *)str,
				2389	start, end,
				2390	(PyUnicodeObject *)substr);
				2391
				2392	Py_DECREF(str);
				2393	Py_DECREF(substr);
				2394	return result;
				2395	}
				2396
				2397	static
				2398	int findstring(PyUnicodeObject *self,
				2399	PyUnicodeObject *substring,
				2400	int start,
				2401	int end,
				2402	int direction)
				2403	{
				2404	if (start < 0)
				2405	start += self->length;
				2406	if (start < 0)
				2407	start = 0;
				2408
				2409	if (substring->length == 0)
				2410	return start;
				2411
				2412	if (end > self->length)
				2413	end = self->length;
				2414	if (end < 0)
				2415	end += self->length;
				2416	if (end < 0)
				2417	end = 0;
				2418
				2419	end -= substring->length;
				2420
				2421	if (direction < 0) {
				2422	for (; end >= start; end--)
				2423	if (Py_UNICODE_MATCH(self, end, substring))
				2424	return end;
				2425	} else {
				2426	for (; start <= end; start++)
				2427	if (Py_UNICODE_MATCH(self, start, substring))
				2428	return start;
				2429	}
				2430
				2431	return -1;
				2432	}
				2433
				2434	int PyUnicode_Find(PyObject *str,
				2435	PyObject *substr,
				2436	int start,
				2437	int end,
				2438	int direction)
				2439	{
				2440	int result;
				2441
				2442	str = PyUnicode_FromObject(str);
				2443	if (str == NULL)
				2444	return -1;
				2445	substr = PyUnicode_FromObject(substr);
				2446	if (substr == NULL) {
				2447	Py_DECREF(substr);
				2448	return -1;
				2449	}
				2450
				2451	result = findstring((PyUnicodeObject *)str,
				2452	(PyUnicodeObject *)substr,
				2453	start, end, direction);
				2454	Py_DECREF(str);
				2455	Py_DECREF(substr);
				2456	return result;
				2457	}
				2458
				2459	static
				2460	int tailmatch(PyUnicodeObject *self,
				2461	PyUnicodeObject *substring,
				2462	int start,
				2463	int end,
				2464	int direction)
				2465	{
				2466	if (start < 0)
				2467	start += self->length;
				2468	if (start < 0)
				2469	start = 0;
				2470
				2471	if (substring->length == 0)
				2472	return 1;
				2473
				2474	if (end > self->length)
				2475	end = self->length;
				2476	if (end < 0)
				2477	end += self->length;
				2478	if (end < 0)
				2479	end = 0;
				2480
				2481	end -= substring->length;
				2482	if (end < start)
				2483	return 0;
				2484
				2485	if (direction > 0) {
				2486	if (Py_UNICODE_MATCH(self, end, substring))
				2487	return 1;
				2488	} else {
				2489	if (Py_UNICODE_MATCH(self, start, substring))
				2490	return 1;
				2491	}
				2492
				2493	return 0;
				2494	}
				2495
				2496	int PyUnicode_Tailmatch(PyObject *str,
				2497	PyObject *substr,
				2498	int start,
				2499	int end,
				2500	int direction)
				2501	{
				2502	int result;
				2503
				2504	str = PyUnicode_FromObject(str);
				2505	if (str == NULL)
				2506	return -1;
				2507	substr = PyUnicode_FromObject(substr);
				2508	if (substr == NULL) {
				2509	Py_DECREF(substr);
				2510	return -1;
				2511	}
				2512
				2513	result = tailmatch((PyUnicodeObject *)str,
				2514	(PyUnicodeObject *)substr,
				2515	start, end, direction);
				2516	Py_DECREF(str);
				2517	Py_DECREF(substr);
				2518	return result;
				2519	}
				2520
				2521	static
				2522	const Py_UNICODE findchar(const Py_UNICODE s,
				2523	int size,
				2524	Py_UNICODE ch)
				2525	{
				2526	/* like wcschr, but doesn't stop at NULL characters */
				2527
				2528	while (size-- > 0) {
				2529	if (*s == ch)
				2530	return s;
				2531	s++;
				2532	}
				2533
				2534	return NULL;
				2535	}
				2536
				2537	/* Apply fixfct filter to the Unicode object self and return a
				2538	reference to the modified object */
				2539
				2540	static
				2541	PyObject fixup(PyUnicodeObject self,
				2542	int (fixfct)(PyUnicodeObject s))
				2543	{
				2544
				2545	PyUnicodeObject *u;
				2546
				2547	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2548	self->length);
				2549	if (u == NULL)
				2550	return NULL;
				2551	if (!fixfct(u)) {
				2552	/* fixfct should return TRUE if it modified the buffer. If
				2553	FALSE, return a reference to the original buffer instead
				2554	(to save space, not time) */
				2555	Py_INCREF(self);
				2556	Py_DECREF(u);
				2557	return (PyObject*) self;
				2558	}
				2559	return (PyObject*) u;
				2560	}
				2561
				2562	static
				2563	int fixupper(PyUnicodeObject *self)
				2564	{
				2565	int len = self->length;
				2566	Py_UNICODE *s = self->str;
				2567	int status = 0;
				2568
				2569	while (len-- > 0) {
				2570	register Py_UNICODE ch;
				2571
				2572	ch = Py_UNICODE_TOUPPER(*s);
				2573	if (ch != *s) {
				2574	status = 1;
				2575	*s = ch;
				2576	}
				2577	s++;
				2578	}
				2579
				2580	return status;
				2581	}
				2582
				2583	static
				2584	int fixlower(PyUnicodeObject *self)
				2585	{
				2586	int len = self->length;
				2587	Py_UNICODE *s = self->str;
				2588	int status = 0;
				2589
				2590	while (len-- > 0) {
				2591	register Py_UNICODE ch;
				2592
				2593	ch = Py_UNICODE_TOLOWER(*s);
				2594	if (ch != *s) {
				2595	status = 1;
				2596	*s = ch;
				2597	}
				2598	s++;
				2599	}
				2600
				2601	return status;
				2602	}
				2603
				2604	static
				2605	int fixswapcase(PyUnicodeObject *self)
				2606	{
				2607	int len = self->length;
				2608	Py_UNICODE *s = self->str;
				2609	int status = 0;
				2610
				2611	while (len-- > 0) {
				2612	if (Py_UNICODE_ISUPPER(*s)) {
				2613	s = Py_UNICODE_TOLOWER(s);
				2614	status = 1;
				2615	} else if (Py_UNICODE_ISLOWER(*s)) {
				2616	s = Py_UNICODE_TOUPPER(s);
				2617	status = 1;
				2618	}
				2619	s++;
				2620	}
				2621
				2622	return status;
				2623	}
				2624
				2625	static
				2626	int fixcapitalize(PyUnicodeObject *self)
				2627	{
				2628	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2629	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2630	return 1;
				2631	}
				2632	return 0;
				2633	}
				2634
				2635	static
				2636	int fixtitle(PyUnicodeObject *self)
				2637	{
				2638	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2639	register Py_UNICODE *e;
				2640	int previous_is_cased;
				2641
				2642	/* Shortcut for single character strings */
				2643	if (PyUnicode_GET_SIZE(self) == 1) {
				2644	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2645	if (*p != ch) {
				2646	*p = ch;
				2647	return 1;
				2648	}
				2649	else
				2650	return 0;
				2651	}
				2652
				2653	e = p + PyUnicode_GET_SIZE(self);
				2654	previous_is_cased = 0;
				2655	for (; p < e; p++) {
				2656	register const Py_UNICODE ch = *p;
				2657
				2658	if (previous_is_cased)
				2659	*p = Py_UNICODE_TOLOWER(ch);
				2660	else
				2661	*p = Py_UNICODE_TOTITLE(ch);
				2662
				2663	if (Py_UNICODE_ISLOWER(ch) \|\|
				2664	Py_UNICODE_ISUPPER(ch) \|\|
				2665	Py_UNICODE_ISTITLE(ch))
				2666	previous_is_cased = 1;
				2667	else
				2668	previous_is_cased = 0;
				2669	}
				2670	return 1;
				2671	}
				2672
				2673	PyObject PyUnicode_Join(PyObject separator,
				2674	PyObject *seq)
				2675	{
				2676	Py_UNICODE *sep;
				2677	int seplen;
				2678	PyUnicodeObject *res = NULL;
				2679	int reslen = 0;
				2680	Py_UNICODE *p;
				2681	int seqlen = 0;
				2682	int sz = 100;
				2683	int i;
				2684
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2685	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2686	if (seqlen < 0 && PyErr_Occurred())
				2687	return NULL;
				2688
				2689	if (separator == NULL) {
				2690	Py_UNICODE blank = ' ';
				2691	sep = &blank;
				2692	seplen = 1;
				2693	}
				2694	else {
				2695	separator = PyUnicode_FromObject(separator);
				2696	if (separator == NULL)
				2697	return NULL;
				2698	sep = PyUnicode_AS_UNICODE(separator);
				2699	seplen = PyUnicode_GET_SIZE(separator);
				2700	}
				2701
				2702	res = _PyUnicode_New(sz);
				2703	if (res == NULL)
				2704	goto onError;
				2705	p = PyUnicode_AS_UNICODE(res);
				2706	reslen = 0;
				2707
				2708	for (i = 0; i < seqlen; i++) {
				2709	int itemlen;
				2710	PyObject *item;
				2711
				2712	item = PySequence_GetItem(seq, i);
				2713	if (item == NULL)
				2714	goto onError;
				2715	if (!PyUnicode_Check(item)) {
				2716	PyObject *v;
				2717	v = PyUnicode_FromObject(item);
				2718	Py_DECREF(item);
				2719	item = v;
				2720	if (item == NULL)
				2721	goto onError;
				2722	}
				2723	itemlen = PyUnicode_GET_SIZE(item);
				2724	while (reslen + itemlen + seplen >= sz) {
				2725	if (_PyUnicode_Resize(res, sz*2))
				2726	goto onError;
				2727	sz *= 2;
				2728	p = PyUnicode_AS_UNICODE(res) + reslen;
				2729	}
				2730	if (i > 0) {
				2731	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2732	p += seplen;
				2733	reslen += seplen;
				2734	}
				2735	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2736	p += itemlen;
				2737	reslen += itemlen;
				2738	Py_DECREF(item);
				2739	}
				2740	if (_PyUnicode_Resize(res, reslen))
				2741	goto onError;
				2742
				2743	Py_XDECREF(separator);
				2744	return (PyObject *)res;
				2745
				2746	onError:
				2747	Py_XDECREF(separator);
				2748	Py_DECREF(res);
				2749	return NULL;
				2750	}
				2751
				2752	static
				2753	PyUnicodeObject pad(PyUnicodeObject self,
				2754	int left,
				2755	int right,
				2756	Py_UNICODE fill)
				2757	{
				2758	PyUnicodeObject *u;
				2759
				2760	if (left < 0)
				2761	left = 0;
				2762	if (right < 0)
				2763	right = 0;
				2764
				2765	if (left == 0 && right == 0) {
				2766	Py_INCREF(self);
				2767	return self;
				2768	}
				2769
				2770	u = _PyUnicode_New(left + self->length + right);
				2771	if (u) {
				2772	if (left)
				2773	Py_UNICODE_FILL(u->str, fill, left);
				2774	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2775	if (right)
				2776	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2777	}
				2778
				2779	return u;
				2780	}
				2781
				2782	#define SPLIT_APPEND(data, left, right) \
				2783	str = PyUnicode_FromUnicode(data + left, right - left); \
				2784	if (!str) \
				2785	goto onError; \
				2786	if (PyList_Append(list, str)) { \
				2787	Py_DECREF(str); \
				2788	goto onError; \
				2789	} \
				2790	else \
				2791	Py_DECREF(str);
				2792
				2793	static
				2794	PyObject split_whitespace(PyUnicodeObject self,
				2795	PyObject *list,
				2796	int maxcount)
				2797	{
				2798	register int i;
				2799	register int j;
				2800	int len = self->length;
				2801	PyObject *str;
				2802
				2803	for (i = j = 0; i < len; ) {
				2804	/* find a token */
				2805	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2806	i++;
				2807	j = i;
				2808	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2809	i++;
				2810	if (j < i) {
				2811	if (maxcount-- <= 0)
				2812	break;
				2813	SPLIT_APPEND(self->str, j, i);
				2814	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2815	i++;
				2816	j = i;
				2817	}
				2818	}
				2819	if (j < len) {
				2820	SPLIT_APPEND(self->str, j, len);
				2821	}
				2822	return list;
				2823
				2824	onError:
				2825	Py_DECREF(list);
				2826	return NULL;
				2827	}
				2828
				2829	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2830	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2831	{
				2832	register int i;
				2833	register int j;
				2834	int len;
				2835	PyObject *list;
				2836	PyObject *str;
				2837	Py_UNICODE *data;
				2838
				2839	string = PyUnicode_FromObject(string);
				2840	if (string == NULL)
				2841	return NULL;
				2842	data = PyUnicode_AS_UNICODE(string);
				2843	len = PyUnicode_GET_SIZE(string);
				2844
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2845	list = PyList_New(0);
				2846	if (!list)
				2847	goto onError;
				2848
				2849	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2850	int eol;
				2851
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2852	/* Find a line and append it */
				2853	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2854	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2855
				2856	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2857	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2858	if (i < len) {
				2859	if (data[i] == '\r' && i + 1 < len &&
				2860	data[i+1] == '\n')
				2861	i += 2;
				2862	else
				2863	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2864	if (keepends)
				2865	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2866	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2867	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2868	j = i;
				2869	}
				2870	if (j < len) {
				2871	SPLIT_APPEND(data, j, len);
				2872	}
				2873
				2874	Py_DECREF(string);
				2875	return list;
				2876
				2877	onError:
				2878	Py_DECREF(list);
				2879	Py_DECREF(string);
				2880	return NULL;
				2881	}
				2882
				2883	static
				2884	PyObject split_char(PyUnicodeObject self,
				2885	PyObject *list,
				2886	Py_UNICODE ch,
				2887	int maxcount)
				2888	{
				2889	register int i;
				2890	register int j;
				2891	int len = self->length;
				2892	PyObject *str;
				2893
				2894	for (i = j = 0; i < len; ) {
				2895	if (self->str[i] == ch) {
				2896	if (maxcount-- <= 0)
				2897	break;
				2898	SPLIT_APPEND(self->str, j, i);
				2899	i = j = i + 1;
				2900	} else
				2901	i++;
				2902	}
				2903	if (j <= len) {
				2904	SPLIT_APPEND(self->str, j, len);
				2905	}
				2906	return list;
				2907
				2908	onError:
				2909	Py_DECREF(list);
				2910	return NULL;
				2911	}
				2912
				2913	static
				2914	PyObject split_substring(PyUnicodeObject self,
				2915	PyObject *list,
				2916	PyUnicodeObject *substring,
				2917	int maxcount)
				2918	{
				2919	register int i;
				2920	register int j;
				2921	int len = self->length;
				2922	int sublen = substring->length;
				2923	PyObject *str;
				2924
				2925	for (i = j = 0; i < len - sublen; ) {
				2926	if (Py_UNICODE_MATCH(self, i, substring)) {
				2927	if (maxcount-- <= 0)
				2928	break;
				2929	SPLIT_APPEND(self->str, j, i);
				2930	i = j = i + sublen;
				2931	} else
				2932	i++;
				2933	}
				2934	if (j <= len) {
				2935	SPLIT_APPEND(self->str, j, len);
				2936	}
				2937	return list;
				2938
				2939	onError:
				2940	Py_DECREF(list);
				2941	return NULL;
				2942	}
				2943
				2944	#undef SPLIT_APPEND
				2945
				2946	static
				2947	PyObject split(PyUnicodeObject self,
				2948	PyUnicodeObject *substring,
				2949	int maxcount)
				2950	{
				2951	PyObject *list;
				2952
				2953	if (maxcount < 0)
				2954	maxcount = INT_MAX;
				2955
				2956	list = PyList_New(0);
				2957	if (!list)
				2958	return NULL;
				2959
				2960	if (substring == NULL)
				2961	return split_whitespace(self,list,maxcount);
				2962
				2963	else if (substring->length == 1)
				2964	return split_char(self,list,substring->str[0],maxcount);
				2965
				2966	else if (substring->length == 0) {
				2967	Py_DECREF(list);
				2968	PyErr_SetString(PyExc_ValueError, "empty separator");
				2969	return NULL;
				2970	}
				2971	else
				2972	return split_substring(self,list,substring,maxcount);
				2973	}
				2974
				2975	static
				2976	PyObject strip(PyUnicodeObject self,
				2977	int left,
				2978	int right)
				2979	{
				2980	Py_UNICODE *p = self->str;
				2981	int start = 0;
				2982	int end = self->length;
				2983
				2984	if (left)
				2985	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2986	start++;
				2987
				2988	if (right)
				2989	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2990	end--;
				2991
				2992	if (start == 0 && end == self->length) {
				2993	/* couldn't strip anything off, return original string */
				2994	Py_INCREF(self);
				2995	return (PyObject*) self;
				2996	}
				2997
				2998	return (PyObject*) PyUnicode_FromUnicode(
				2999	self->str + start,
				3000	end - start
				3001	);
				3002	}
				3003
				3004	static
				3005	PyObject replace(PyUnicodeObject self,
				3006	PyUnicodeObject *str1,
				3007	PyUnicodeObject *str2,
				3008	int maxcount)
				3009	{
				3010	PyUnicodeObject *u;
				3011
				3012	if (maxcount < 0)
				3013	maxcount = INT_MAX;
				3014
				3015	if (str1->length == 1 && str2->length == 1) {
				3016	int i;
				3017
				3018	/* replace characters */
				3019	if (!findchar(self->str, self->length, str1->str[0])) {
				3020	/* nothing to replace, return original string */
				3021	Py_INCREF(self);
				3022	u = self;
				3023	} else {
				3024	Py_UNICODE u1 = str1->str[0];
				3025	Py_UNICODE u2 = str2->str[0];
				3026
				3027	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3028	self->str,
				3029	self->length
				3030	);
				3031	if (u)
				3032	for (i = 0; i < u->length; i++)
				3033	if (u->str[i] == u1) {
				3034	if (--maxcount < 0)
				3035	break;
				3036	u->str[i] = u2;
				3037	}
				3038	}
				3039
				3040	} else {
				3041	int n, i;
				3042	Py_UNICODE *p;
				3043
				3044	/* replace strings */
				3045	n = count(self, 0, self->length, str1);
				3046	if (n > maxcount)
				3047	n = maxcount;
				3048	if (n == 0) {
				3049	/* nothing to replace, return original string */
				3050	Py_INCREF(self);
				3051	u = self;
				3052	} else {
				3053	u = _PyUnicode_New(
				3054	self->length + n * (str2->length - str1->length));
				3055	if (u) {
				3056	i = 0;
				3057	p = u->str;
				3058	while (i <= self->length - str1->length)
				3059	if (Py_UNICODE_MATCH(self, i, str1)) {
				3060	/* replace string segment */
				3061	Py_UNICODE_COPY(p, str2->str, str2->length);
				3062	p += str2->length;
				3063	i += str1->length;
				3064	if (--n <= 0) {
				3065	/* copy remaining part */
				3066	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3067	break;
				3068	}
				3069	} else
				3070	*p++ = self->str[i++];
				3071	}
				3072	}
				3073	}
				3074
				3075	return (PyObject *) u;
				3076	}
				3077
				3078	/* --- Unicode Object Methods --------------------------------------------- */
				3079
				3080	static char title__doc__[] =
				3081	"S.title() -> unicode\n\
				3082	\n\
				3083	Return a titlecased version of S, i.e. words start with title case\n\
				3084	characters, all remaining cased characters have lower case.";
				3085
				3086	static PyObject*
				3087	unicode_title(PyUnicodeObject self, PyObject args)
				3088	{
				3089	if (!PyArg_NoArgs(args))
				3090	return NULL;
				3091	return fixup(self, fixtitle);
				3092	}
				3093
				3094	static char capitalize__doc__[] =
				3095	"S.capitalize() -> unicode\n\
				3096	\n\
				3097	Return a capitalized version of S, i.e. make the first character\n\
				3098	have upper case.";
				3099
				3100	static PyObject*
				3101	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3102	{
				3103	if (!PyArg_NoArgs(args))
				3104	return NULL;
				3105	return fixup(self, fixcapitalize);
				3106	}
				3107
				3108	#if 0
				3109	static char capwords__doc__[] =
				3110	"S.capwords() -> unicode\n\
				3111	\n\
				3112	Apply .capitalize() to all words in S and return the result with\n\
				3113	normalized whitespace (all whitespace strings are replaced by ' ').";
				3114
				3115	static PyObject*
				3116	unicode_capwords(PyUnicodeObject self, PyObject args)
				3117	{
				3118	PyObject *list;
				3119	PyObject *item;
				3120	int i;
				3121
				3122	if (!PyArg_NoArgs(args))
				3123	return NULL;
				3124
				3125	/* Split into words */
				3126	list = split(self, NULL, -1);
				3127	if (!list)
				3128	return NULL;
				3129
				3130	/* Capitalize each word */
				3131	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3132	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3133	fixcapitalize);
				3134	if (item == NULL)
				3135	goto onError;
				3136	Py_DECREF(PyList_GET_ITEM(list, i));
				3137	PyList_SET_ITEM(list, i, item);
				3138	}
				3139
				3140	/* Join the words to form a new string */
				3141	item = PyUnicode_Join(NULL, list);
				3142
				3143	onError:
				3144	Py_DECREF(list);
				3145	return (PyObject *)item;
				3146	}
				3147	#endif
				3148
				3149	static char center__doc__[] =
				3150	"S.center(width) -> unicode\n\
				3151	\n\
				3152	Return S centered in a Unicode string of length width. Padding is done\n\
				3153	using spaces.";
				3154
				3155	static PyObject *
				3156	unicode_center(PyUnicodeObject self, PyObject args)
				3157	{
				3158	int marg, left;
				3159	int width;
				3160
				3161	if (!PyArg_ParseTuple(args, "i:center", &width))
				3162	return NULL;
				3163
				3164	if (self->length >= width) {
				3165	Py_INCREF(self);
				3166	return (PyObject*) self;
				3167	}
				3168
				3169	marg = width - self->length;
				3170	left = marg / 2 + (marg & width & 1);
				3171
				3172	return (PyObject*) pad(self, left, marg - left, ' ');
				3173	}
				3174
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3175	#if 0
				3176
				3177	/* This code should go into some future Unicode collation support
				3178	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3179	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3180
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3181	/* speedy UTF-16 code point order comparison */
				3182	/* gleaned from: */
				3183	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3184
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3185	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3186	{
				3187	0, 0, 0, 0, 0, 0, 0, 0,
				3188	0, 0, 0, 0, 0, 0, 0, 0,
				3189	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3190	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3191	};
				3192
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3193	static int
				3194	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3195	{
				3196	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3197
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3198	Py_UNICODE *s1 = str1->str;
				3199	Py_UNICODE *s2 = str2->str;
				3200
				3201	len1 = str1->length;
				3202	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3203
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3204	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3205	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3206	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3207
				3208	c1 = *s1++;
				3209	c2 = *s2++;
				3210	if (c1 > (1<<11) * 26)
				3211	c1 += utf16Fixup[c1>>11];
				3212	if (c2 > (1<<11) * 26)
				3213	c2 += utf16Fixup[c2>>11];
				3214
				3215	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3216	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3217	if (diff)
				3218	return (diff < 0) ? -1 : (diff != 0);
				3219	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3220	}
				3221
				3222	return (len1 < len2) ? -1 : (len1 != len2);
				3223	}
				3224
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3225	#else
				3226
				3227	static int
				3228	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3229	{
				3230	register int len1, len2;
				3231
				3232	Py_UNICODE *s1 = str1->str;
				3233	Py_UNICODE *s2 = str2->str;
				3234
				3235	len1 = str1->length;
				3236	len2 = str2->length;
				3237
				3238	while (len1 > 0 && len2 > 0) {
				3239	register long diff;
				3240
				3241	diff = (long)s1++ - (long)s2++;
				3242	if (diff)
				3243	return (diff < 0) ? -1 : (diff != 0);
				3244	len1--; len2--;
				3245	}
				3246
				3247	return (len1 < len2) ? -1 : (len1 != len2);
				3248	}
				3249
				3250	#endif
				3251
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3252	int PyUnicode_Compare(PyObject *left,
				3253	PyObject *right)
				3254	{
				3255	PyUnicodeObject u = NULL, v = NULL;
				3256	int result;
				3257
				3258	/* Coerce the two arguments */
				3259	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3260	if (u == NULL)
				3261	goto onError;
				3262	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3263	if (v == NULL)
				3264	goto onError;
				3265
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3266	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3267	if (v == u) {
				3268	Py_DECREF(u);
				3269	Py_DECREF(v);
				3270	return 0;
				3271	}
				3272
				3273	result = unicode_compare(u, v);
				3274
				3275	Py_DECREF(u);
				3276	Py_DECREF(v);
				3277	return result;
				3278
				3279	onError:
				3280	Py_XDECREF(u);
				3281	Py_XDECREF(v);
				3282	return -1;
				3283	}
				3284
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3285	int PyUnicode_Contains(PyObject *container,
				3286	PyObject *element)
				3287	{
				3288	PyUnicodeObject u = NULL, v = NULL;
				3289	int result;
				3290	register const Py_UNICODE p, e;
				3291	register Py_UNICODE ch;
				3292
				3293	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3294	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3295	if (v == NULL) {
				3296	PyErr_SetString(PyExc_TypeError,
				3297	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3298	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3299	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3300	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3301	if (u == NULL) {
				3302	Py_DECREF(v);
				3303	goto onError;
				3304	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3305
				3306	/* Check v in u */
				3307	if (PyUnicode_GET_SIZE(v) != 1) {
				3308	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3309	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3310	goto onError;
				3311	}
				3312	ch = *PyUnicode_AS_UNICODE(v);
				3313	p = PyUnicode_AS_UNICODE(u);
				3314	e = p + PyUnicode_GET_SIZE(u);
				3315	result = 0;
				3316	while (p < e) {
				3317	if (*p++ == ch) {
				3318	result = 1;
				3319	break;
				3320	}
				3321	}
				3322
				3323	Py_DECREF(u);
				3324	Py_DECREF(v);
				3325	return result;
				3326
				3327	onError:
				3328	Py_XDECREF(u);
				3329	Py_XDECREF(v);
				3330	return -1;
				3331	}
				3332
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3333	/* Concat to string or Unicode object giving a new Unicode object. */
				3334
				3335	PyObject PyUnicode_Concat(PyObject left,
				3336	PyObject *right)
				3337	{
				3338	PyUnicodeObject u = NULL, v = NULL, *w;
				3339
				3340	/* Coerce the two arguments */
				3341	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3342	if (u == NULL)
				3343	goto onError;
				3344	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3345	if (v == NULL)
				3346	goto onError;
				3347
				3348	/* Shortcuts */
				3349	if (v == unicode_empty) {
				3350	Py_DECREF(v);
				3351	return (PyObject *)u;
				3352	}
				3353	if (u == unicode_empty) {
				3354	Py_DECREF(u);
				3355	return (PyObject *)v;
				3356	}
				3357
				3358	/* Concat the two Unicode strings */
				3359	w = _PyUnicode_New(u->length + v->length);
				3360	if (w == NULL)
				3361	goto onError;
				3362	Py_UNICODE_COPY(w->str, u->str, u->length);
				3363	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3364
				3365	Py_DECREF(u);
				3366	Py_DECREF(v);
				3367	return (PyObject *)w;
				3368
				3369	onError:
				3370	Py_XDECREF(u);
				3371	Py_XDECREF(v);
				3372	return NULL;
				3373	}
				3374
				3375	static char count__doc__[] =
				3376	"S.count(sub[, start[, end]]) -> int\n\
				3377	\n\
				3378	Return the number of occurrences of substring sub in Unicode string\n\
				3379	S[start:end]. Optional arguments start and end are\n\
				3380	interpreted as in slice notation.";
				3381
				3382	static PyObject *
				3383	unicode_count(PyUnicodeObject self, PyObject args)
				3384	{
				3385	PyUnicodeObject *substring;
				3386	int start = 0;
				3387	int end = INT_MAX;
				3388	PyObject *result;
				3389
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3390	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3391	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3392	return NULL;
				3393
				3394	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3395	(PyObject *)substring);
				3396	if (substring == NULL)
				3397	return NULL;
				3398
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3399	if (start < 0)
				3400	start += self->length;
				3401	if (start < 0)
				3402	start = 0;
				3403	if (end > self->length)
				3404	end = self->length;
				3405	if (end < 0)
				3406	end += self->length;
				3407	if (end < 0)
				3408	end = 0;
				3409
				3410	result = PyInt_FromLong((long) count(self, start, end, substring));
				3411
				3412	Py_DECREF(substring);
				3413	return result;
				3414	}
				3415
				3416	static char encode__doc__[] =
				3417	"S.encode([encoding[,errors]]) -> string\n\
				3418	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3419	Return an encoded string version of S. Default encoding is the current\n\
				3420	default string encoding. errors may be given to set a different error\n\
				3421	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3422	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3423
				3424	static PyObject *
				3425	unicode_encode(PyUnicodeObject self, PyObject args)
				3426	{
				3427	char *encoding = NULL;
				3428	char *errors = NULL;
				3429	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3430	return NULL;
				3431	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3432	}
				3433
				3434	static char expandtabs__doc__[] =
				3435	"S.expandtabs([tabsize]) -> unicode\n\
				3436	\n\
				3437	Return a copy of S where all tab characters are expanded using spaces.\n\
				3438	If tabsize is not given, a tab size of 8 characters is assumed.";
				3439
				3440	static PyObject*
				3441	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3442	{
				3443	Py_UNICODE *e;
				3444	Py_UNICODE *p;
				3445	Py_UNICODE *q;
				3446	int i, j;
				3447	PyUnicodeObject *u;
				3448	int tabsize = 8;
				3449
				3450	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3451	return NULL;
				3452
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3453	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3454	i = j = 0;
				3455	e = self->str + self->length;
				3456	for (p = self->str; p < e; p++)
				3457	if (*p == '\t') {
				3458	if (tabsize > 0)
				3459	j += tabsize - (j % tabsize);
				3460	}
				3461	else {
				3462	j++;
				3463	if (p == '\n' \|\| p == '\r') {
				3464	i += j;
				3465	j = 0;
				3466	}
				3467	}
				3468
				3469	/* Second pass: create output string and fill it */
				3470	u = _PyUnicode_New(i + j);
				3471	if (!u)
				3472	return NULL;
				3473
				3474	j = 0;
				3475	q = u->str;
				3476
				3477	for (p = self->str; p < e; p++)
				3478	if (*p == '\t') {
				3479	if (tabsize > 0) {
				3480	i = tabsize - (j % tabsize);
				3481	j += i;
				3482	while (i--)
				3483	*q++ = ' ';
				3484	}
				3485	}
				3486	else {
				3487	j++;
				3488	q++ = p;
				3489	if (p == '\n' \|\| p == '\r')
				3490	j = 0;
				3491	}
				3492
				3493	return (PyObject*) u;
				3494	}
				3495
				3496	static char find__doc__[] =
				3497	"S.find(sub [,start [,end]]) -> int\n\
				3498	\n\
				3499	Return the lowest index in S where substring sub is found,\n\
				3500	such that sub is contained within s[start,end]. Optional\n\
				3501	arguments start and end are interpreted as in slice notation.\n\
				3502	\n\
				3503	Return -1 on failure.";
				3504
				3505	static PyObject *
				3506	unicode_find(PyUnicodeObject self, PyObject args)
				3507	{
				3508	PyUnicodeObject *substring;
				3509	int start = 0;
				3510	int end = INT_MAX;
				3511	PyObject *result;
				3512
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3513	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3514	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3515	return NULL;
				3516	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3517	(PyObject *)substring);
				3518	if (substring == NULL)
				3519	return NULL;
				3520
				3521	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3522
				3523	Py_DECREF(substring);
				3524	return result;
				3525	}
				3526
				3527	static PyObject *
				3528	unicode_getitem(PyUnicodeObject *self, int index)
				3529	{
				3530	if (index < 0 \|\| index >= self->length) {
				3531	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3532	return NULL;
				3533	}
				3534
				3535	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3536	}
				3537
				3538	static long
				3539	unicode_hash(PyUnicodeObject *self)
				3540	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3541	/* Since Unicode objects compare equal to their ASCII string
				3542	counterparts, they should use the individual character values
				3543	as basis for their hash value. This is needed to assure that
				3544	strings and Unicode objects behave in the same way as
				3545	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3546
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3547	register int len;
				3548	register Py_UNICODE *p;
				3549	register long x;
				3550
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3551	if (self->hash != -1)
				3552	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3553	len = PyUnicode_GET_SIZE(self);
				3554	p = PyUnicode_AS_UNICODE(self);
				3555	x = *p << 7;
				3556	while (--len >= 0)
				3557	x = (1000003x) ^ p++;
				3558	x ^= PyUnicode_GET_SIZE(self);
				3559	if (x == -1)
				3560	x = -2;
				3561	self->hash = x;
				3562	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3563	}
				3564
				3565	static char index__doc__[] =
				3566	"S.index(sub [,start [,end]]) -> int\n\
				3567	\n\
				3568	Like S.find() but raise ValueError when the substring is not found.";
				3569
				3570	static PyObject *
				3571	unicode_index(PyUnicodeObject self, PyObject args)
				3572	{
				3573	int result;
				3574	PyUnicodeObject *substring;
				3575	int start = 0;
				3576	int end = INT_MAX;
				3577
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3578	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3579	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3580	return NULL;
				3581
				3582	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3583	(PyObject *)substring);
				3584	if (substring == NULL)
				3585	return NULL;
				3586
				3587	result = findstring(self, substring, start, end, 1);
				3588
				3589	Py_DECREF(substring);
				3590	if (result < 0) {
				3591	PyErr_SetString(PyExc_ValueError, "substring not found");
				3592	return NULL;
				3593	}
				3594	return PyInt_FromLong(result);
				3595	}
				3596
				3597	static char islower__doc__[] =
				3598	"S.islower() -> int\n\
				3599	\n\
				3600	Return 1 if all cased characters in S are lowercase and there is\n\
				3601	at least one cased character in S, 0 otherwise.";
				3602
				3603	static PyObject*
				3604	unicode_islower(PyUnicodeObject self, PyObject args)
				3605	{
				3606	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3607	register const Py_UNICODE *e;
				3608	int cased;
				3609
				3610	if (!PyArg_NoArgs(args))
				3611	return NULL;
				3612
				3613	/* Shortcut for single character strings */
				3614	if (PyUnicode_GET_SIZE(self) == 1)
				3615	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3616
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3617	/* Special case for empty strings */
				3618	if (PyString_GET_SIZE(self) == 0)
				3619	return PyInt_FromLong(0);
				3620
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3621	e = p + PyUnicode_GET_SIZE(self);
				3622	cased = 0;
				3623	for (; p < e; p++) {
				3624	register const Py_UNICODE ch = *p;
				3625
				3626	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3627	return PyInt_FromLong(0);
				3628	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3629	cased = 1;
				3630	}
				3631	return PyInt_FromLong(cased);
				3632	}
				3633
				3634	static char isupper__doc__[] =
				3635	"S.isupper() -> int\n\
				3636	\n\
				3637	Return 1 if all cased characters in S are uppercase and there is\n\
				3638	at least one cased character in S, 0 otherwise.";
				3639
				3640	static PyObject*
				3641	unicode_isupper(PyUnicodeObject self, PyObject args)
				3642	{
				3643	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3644	register const Py_UNICODE *e;
				3645	int cased;
				3646
				3647	if (!PyArg_NoArgs(args))
				3648	return NULL;
				3649
				3650	/* Shortcut for single character strings */
				3651	if (PyUnicode_GET_SIZE(self) == 1)
				3652	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3653
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3654	/* Special case for empty strings */
				3655	if (PyString_GET_SIZE(self) == 0)
				3656	return PyInt_FromLong(0);
				3657
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3658	e = p + PyUnicode_GET_SIZE(self);
				3659	cased = 0;
				3660	for (; p < e; p++) {
				3661	register const Py_UNICODE ch = *p;
				3662
				3663	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3664	return PyInt_FromLong(0);
				3665	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3666	cased = 1;
				3667	}
				3668	return PyInt_FromLong(cased);
				3669	}
				3670
				3671	static char istitle__doc__[] =
				3672	"S.istitle() -> int\n\
				3673	\n\
				3674	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3675	may only follow uncased characters and lowercase characters only cased\n\
				3676	ones. Return 0 otherwise.";
				3677
				3678	static PyObject*
				3679	unicode_istitle(PyUnicodeObject self, PyObject args)
				3680	{
				3681	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3682	register const Py_UNICODE *e;
				3683	int cased, previous_is_cased;
				3684
				3685	if (!PyArg_NoArgs(args))
				3686	return NULL;
				3687
				3688	/* Shortcut for single character strings */
				3689	if (PyUnicode_GET_SIZE(self) == 1)
				3690	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3691	(Py_UNICODE_ISUPPER(*p) != 0));
				3692
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3693	/* Special case for empty strings */
				3694	if (PyString_GET_SIZE(self) == 0)
				3695	return PyInt_FromLong(0);
				3696
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3697	e = p + PyUnicode_GET_SIZE(self);
				3698	cased = 0;
				3699	previous_is_cased = 0;
				3700	for (; p < e; p++) {
				3701	register const Py_UNICODE ch = *p;
				3702
				3703	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3704	if (previous_is_cased)
				3705	return PyInt_FromLong(0);
				3706	previous_is_cased = 1;
				3707	cased = 1;
				3708	}
				3709	else if (Py_UNICODE_ISLOWER(ch)) {
				3710	if (!previous_is_cased)
				3711	return PyInt_FromLong(0);
				3712	previous_is_cased = 1;
				3713	cased = 1;
				3714	}
				3715	else
				3716	previous_is_cased = 0;
				3717	}
				3718	return PyInt_FromLong(cased);
				3719	}
				3720
				3721	static char isspace__doc__[] =
				3722	"S.isspace() -> int\n\
				3723	\n\
				3724	Return 1 if there are only whitespace characters in S,\n\
				3725	0 otherwise.";
				3726
				3727	static PyObject*
				3728	unicode_isspace(PyUnicodeObject self, PyObject args)
				3729	{
				3730	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3731	register const Py_UNICODE *e;
				3732
				3733	if (!PyArg_NoArgs(args))
				3734	return NULL;
				3735
				3736	/* Shortcut for single character strings */
				3737	if (PyUnicode_GET_SIZE(self) == 1 &&
				3738	Py_UNICODE_ISSPACE(*p))
				3739	return PyInt_FromLong(1);
				3740
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3741	/* Special case for empty strings */
				3742	if (PyString_GET_SIZE(self) == 0)
				3743	return PyInt_FromLong(0);
				3744
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3745	e = p + PyUnicode_GET_SIZE(self);
				3746	for (; p < e; p++) {
				3747	if (!Py_UNICODE_ISSPACE(*p))
				3748	return PyInt_FromLong(0);
				3749	}
				3750	return PyInt_FromLong(1);
				3751	}
				3752
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3753	static char isalpha__doc__[] =
				3754	"S.isalpha() -> int\n\
				3755	\n\
				3756	Return 1 if all characters in S are alphabetic\n\
				3757	and there is at least one character in S, 0 otherwise.";
				3758
				3759	static PyObject*
				3760	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3761	{
				3762	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3763	register const Py_UNICODE *e;
				3764
				3765	if (!PyArg_NoArgs(args))
				3766	return NULL;
				3767
				3768	/* Shortcut for single character strings */
				3769	if (PyUnicode_GET_SIZE(self) == 1 &&
				3770	Py_UNICODE_ISALPHA(*p))
				3771	return PyInt_FromLong(1);
				3772
				3773	/* Special case for empty strings */
				3774	if (PyString_GET_SIZE(self) == 0)
				3775	return PyInt_FromLong(0);
				3776
				3777	e = p + PyUnicode_GET_SIZE(self);
				3778	for (; p < e; p++) {
				3779	if (!Py_UNICODE_ISALPHA(*p))
				3780	return PyInt_FromLong(0);
				3781	}
				3782	return PyInt_FromLong(1);
				3783	}
				3784
				3785	static char isalnum__doc__[] =
				3786	"S.isalnum() -> int\n\
				3787	\n\
				3788	Return 1 if all characters in S are alphanumeric\n\
				3789	and there is at least one character in S, 0 otherwise.";
				3790
				3791	static PyObject*
				3792	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3793	{
				3794	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3795	register const Py_UNICODE *e;
				3796
				3797	if (!PyArg_NoArgs(args))
				3798	return NULL;
				3799
				3800	/* Shortcut for single character strings */
				3801	if (PyUnicode_GET_SIZE(self) == 1 &&
				3802	Py_UNICODE_ISALNUM(*p))
				3803	return PyInt_FromLong(1);
				3804
				3805	/* Special case for empty strings */
				3806	if (PyString_GET_SIZE(self) == 0)
				3807	return PyInt_FromLong(0);
				3808
				3809	e = p + PyUnicode_GET_SIZE(self);
				3810	for (; p < e; p++) {
				3811	if (!Py_UNICODE_ISALNUM(*p))
				3812	return PyInt_FromLong(0);
				3813	}
				3814	return PyInt_FromLong(1);
				3815	}
				3816
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3817	static char isdecimal__doc__[] =
				3818	"S.isdecimal() -> int\n\
				3819	\n\
				3820	Return 1 if there are only decimal characters in S,\n\
				3821	0 otherwise.";
				3822
				3823	static PyObject*
				3824	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3825	{
				3826	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3827	register const Py_UNICODE *e;
				3828
				3829	if (!PyArg_NoArgs(args))
				3830	return NULL;
				3831
				3832	/* Shortcut for single character strings */
				3833	if (PyUnicode_GET_SIZE(self) == 1 &&
				3834	Py_UNICODE_ISDECIMAL(*p))
				3835	return PyInt_FromLong(1);
				3836
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3837	/* Special case for empty strings */
				3838	if (PyString_GET_SIZE(self) == 0)
				3839	return PyInt_FromLong(0);
				3840
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3841	e = p + PyUnicode_GET_SIZE(self);
				3842	for (; p < e; p++) {
				3843	if (!Py_UNICODE_ISDECIMAL(*p))
				3844	return PyInt_FromLong(0);
				3845	}
				3846	return PyInt_FromLong(1);
				3847	}
				3848
				3849	static char isdigit__doc__[] =
				3850	"S.isdigit() -> int\n\
				3851	\n\
				3852	Return 1 if there are only digit characters in S,\n\
				3853	0 otherwise.";
				3854
				3855	static PyObject*
				3856	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3857	{
				3858	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3859	register const Py_UNICODE *e;
				3860
				3861	if (!PyArg_NoArgs(args))
				3862	return NULL;
				3863
				3864	/* Shortcut for single character strings */
				3865	if (PyUnicode_GET_SIZE(self) == 1 &&
				3866	Py_UNICODE_ISDIGIT(*p))
				3867	return PyInt_FromLong(1);
				3868
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3869	/* Special case for empty strings */
				3870	if (PyString_GET_SIZE(self) == 0)
				3871	return PyInt_FromLong(0);
				3872
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3873	e = p + PyUnicode_GET_SIZE(self);
				3874	for (; p < e; p++) {
				3875	if (!Py_UNICODE_ISDIGIT(*p))
				3876	return PyInt_FromLong(0);
				3877	}
				3878	return PyInt_FromLong(1);
				3879	}
				3880
				3881	static char isnumeric__doc__[] =
				3882	"S.isnumeric() -> int\n\
				3883	\n\
				3884	Return 1 if there are only numeric characters in S,\n\
				3885	0 otherwise.";
				3886
				3887	static PyObject*
				3888	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3889	{
				3890	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3891	register const Py_UNICODE *e;
				3892
				3893	if (!PyArg_NoArgs(args))
				3894	return NULL;
				3895
				3896	/* Shortcut for single character strings */
				3897	if (PyUnicode_GET_SIZE(self) == 1 &&
				3898	Py_UNICODE_ISNUMERIC(*p))
				3899	return PyInt_FromLong(1);
				3900
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3901	/* Special case for empty strings */
				3902	if (PyString_GET_SIZE(self) == 0)
				3903	return PyInt_FromLong(0);
				3904
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3905	e = p + PyUnicode_GET_SIZE(self);
				3906	for (; p < e; p++) {
				3907	if (!Py_UNICODE_ISNUMERIC(*p))
				3908	return PyInt_FromLong(0);
				3909	}
				3910	return PyInt_FromLong(1);
				3911	}
				3912
				3913	static char join__doc__[] =
				3914	"S.join(sequence) -> unicode\n\
				3915	\n\
				3916	Return a string which is the concatenation of the strings in the\n\
				3917	sequence. The separator between elements is S.";
				3918
				3919	static PyObject*
				3920	unicode_join(PyUnicodeObject self, PyObject args)
				3921	{
				3922	PyObject *data;
				3923	if (!PyArg_ParseTuple(args, "O:join", &data))
				3924	return NULL;
				3925
				3926	return PyUnicode_Join((PyObject *)self, data);
				3927	}
				3928
				3929	static int
				3930	unicode_length(PyUnicodeObject *self)
				3931	{
				3932	return self->length;
				3933	}
				3934
				3935	static char ljust__doc__[] =
				3936	"S.ljust(width) -> unicode\n\
				3937	\n\
				3938	Return S left justified in a Unicode string of length width. Padding is\n\
				3939	done using spaces.";
				3940
				3941	static PyObject *
				3942	unicode_ljust(PyUnicodeObject self, PyObject args)
				3943	{
				3944	int width;
				3945	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3946	return NULL;
				3947
				3948	if (self->length >= width) {
				3949	Py_INCREF(self);
				3950	return (PyObject*) self;
				3951	}
				3952
				3953	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3954	}
				3955
				3956	static char lower__doc__[] =
				3957	"S.lower() -> unicode\n\
				3958	\n\
				3959	Return a copy of the string S converted to lowercase.";
				3960
				3961	static PyObject*
				3962	unicode_lower(PyUnicodeObject self, PyObject args)
				3963	{
				3964	if (!PyArg_NoArgs(args))
				3965	return NULL;
				3966	return fixup(self, fixlower);
				3967	}
				3968
				3969	static char lstrip__doc__[] =
				3970	"S.lstrip() -> unicode\n\
				3971	\n\
				3972	Return a copy of the string S with leading whitespace removed.";
				3973
				3974	static PyObject *
				3975	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3976	{
				3977	if (!PyArg_NoArgs(args))
				3978	return NULL;
				3979	return strip(self, 1, 0);
				3980	}
				3981
				3982	static PyObject*
				3983	unicode_repeat(PyUnicodeObject *str, int len)
				3984	{
				3985	PyUnicodeObject *u;
				3986	Py_UNICODE *p;
				3987
				3988	if (len < 0)
				3989	len = 0;
				3990
				3991	if (len == 1) {
				3992	/* no repeat, return original string */
				3993	Py_INCREF(str);
				3994	return (PyObject*) str;
				3995	}
				3996
				3997	u = _PyUnicode_New(len * str->length);
				3998	if (!u)
				3999	return NULL;
				4000
				4001	p = u->str;
				4002
				4003	while (len-- > 0) {
				4004	Py_UNICODE_COPY(p, str->str, str->length);
				4005	p += str->length;
				4006	}
				4007
				4008	return (PyObject*) u;
				4009	}
				4010
				4011	PyObject PyUnicode_Replace(PyObject obj,
				4012	PyObject *subobj,
				4013	PyObject *replobj,
				4014	int maxcount)
				4015	{
				4016	PyObject *self;
				4017	PyObject *str1;
				4018	PyObject *str2;
				4019	PyObject *result;
				4020
				4021	self = PyUnicode_FromObject(obj);
				4022	if (self == NULL)
				4023	return NULL;
				4024	str1 = PyUnicode_FromObject(subobj);
				4025	if (str1 == NULL) {
				4026	Py_DECREF(self);
				4027	return NULL;
				4028	}
				4029	str2 = PyUnicode_FromObject(replobj);
				4030	if (str2 == NULL) {
				4031	Py_DECREF(self);
				4032	Py_DECREF(str1);
				4033	return NULL;
				4034	}
				4035	result = replace((PyUnicodeObject *)self,
				4036	(PyUnicodeObject *)str1,
				4037	(PyUnicodeObject *)str2,
				4038	maxcount);
				4039	Py_DECREF(self);
				4040	Py_DECREF(str1);
				4041	Py_DECREF(str2);
				4042	return result;
				4043	}
				4044
				4045	static char replace__doc__[] =
				4046	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4047	\n\
				4048	Return a copy of S with all occurrences of substring\n\
				4049	old replaced by new. If the optional argument maxsplit is\n\
				4050	given, only the first maxsplit occurrences are replaced.";
				4051
				4052	static PyObject*
				4053	unicode_replace(PyUnicodeObject self, PyObject args)
				4054	{
				4055	PyUnicodeObject *str1;
				4056	PyUnicodeObject *str2;
				4057	int maxcount = -1;
				4058	PyObject *result;
				4059
				4060	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4061	return NULL;
				4062	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4063	if (str1 == NULL)
				4064	return NULL;
				4065	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4066	if (str2 == NULL)
				4067	return NULL;
				4068
				4069	result = replace(self, str1, str2, maxcount);
				4070
				4071	Py_DECREF(str1);
				4072	Py_DECREF(str2);
				4073	return result;
				4074	}
				4075
				4076	static
				4077	PyObject unicode_repr(PyObject unicode)
				4078	{
				4079	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4080	PyUnicode_GET_SIZE(unicode),
				4081	1);
				4082	}
				4083
				4084	static char rfind__doc__[] =
				4085	"S.rfind(sub [,start [,end]]) -> int\n\
				4086	\n\
				4087	Return the highest index in S where substring sub is found,\n\
				4088	such that sub is contained within s[start,end]. Optional\n\
				4089	arguments start and end are interpreted as in slice notation.\n\
				4090	\n\
				4091	Return -1 on failure.";
				4092
				4093	static PyObject *
				4094	unicode_rfind(PyUnicodeObject self, PyObject args)
				4095	{
				4096	PyUnicodeObject *substring;
				4097	int start = 0;
				4098	int end = INT_MAX;
				4099	PyObject *result;
				4100
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4101	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4102	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4103	return NULL;
				4104	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4105	(PyObject *)substring);
				4106	if (substring == NULL)
				4107	return NULL;
				4108
				4109	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4110
				4111	Py_DECREF(substring);
				4112	return result;
				4113	}
				4114
				4115	static char rindex__doc__[] =
				4116	"S.rindex(sub [,start [,end]]) -> int\n\
				4117	\n\
				4118	Like S.rfind() but raise ValueError when the substring is not found.";
				4119
				4120	static PyObject *
				4121	unicode_rindex(PyUnicodeObject self, PyObject args)
				4122	{
				4123	int result;
				4124	PyUnicodeObject *substring;
				4125	int start = 0;
				4126	int end = INT_MAX;
				4127
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4128	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4129	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4130	return NULL;
				4131	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4132	(PyObject *)substring);
				4133	if (substring == NULL)
				4134	return NULL;
				4135
				4136	result = findstring(self, substring, start, end, -1);
				4137
				4138	Py_DECREF(substring);
				4139	if (result < 0) {
				4140	PyErr_SetString(PyExc_ValueError, "substring not found");
				4141	return NULL;
				4142	}
				4143	return PyInt_FromLong(result);
				4144	}
				4145
				4146	static char rjust__doc__[] =
				4147	"S.rjust(width) -> unicode\n\
				4148	\n\
				4149	Return S right justified in a Unicode string of length width. Padding is\n\
				4150	done using spaces.";
				4151
				4152	static PyObject *
				4153	unicode_rjust(PyUnicodeObject self, PyObject args)
				4154	{
				4155	int width;
				4156	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4157	return NULL;
				4158
				4159	if (self->length >= width) {
				4160	Py_INCREF(self);
				4161	return (PyObject*) self;
				4162	}
				4163
				4164	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4165	}
				4166
				4167	static char rstrip__doc__[] =
				4168	"S.rstrip() -> unicode\n\
				4169	\n\
				4170	Return a copy of the string S with trailing whitespace removed.";
				4171
				4172	static PyObject *
				4173	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4174	{
				4175	if (!PyArg_NoArgs(args))
				4176	return NULL;
				4177	return strip(self, 0, 1);
				4178	}
				4179
				4180	static PyObject*
				4181	unicode_slice(PyUnicodeObject *self, int start, int end)
				4182	{
				4183	/* standard clamping */
				4184	if (start < 0)
				4185	start = 0;
				4186	if (end < 0)
				4187	end = 0;
				4188	if (end > self->length)
				4189	end = self->length;
				4190	if (start == 0 && end == self->length) {
				4191	/* full slice, return original string */
				4192	Py_INCREF(self);
				4193	return (PyObject*) self;
				4194	}
				4195	if (start > end)
				4196	start = end;
				4197	/* copy slice */
				4198	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4199	end - start);
				4200	}
				4201
				4202	PyObject PyUnicode_Split(PyObject s,
				4203	PyObject *sep,
				4204	int maxsplit)
				4205	{
				4206	PyObject *result;
				4207
				4208	s = PyUnicode_FromObject(s);
				4209	if (s == NULL)
				4210	return NULL;
				4211	if (sep != NULL) {
				4212	sep = PyUnicode_FromObject(sep);
				4213	if (sep == NULL) {
				4214	Py_DECREF(s);
				4215	return NULL;
				4216	}
				4217	}
				4218
				4219	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4220
				4221	Py_DECREF(s);
				4222	Py_XDECREF(sep);
				4223	return result;
				4224	}
				4225
				4226	static char split__doc__[] =
				4227	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4228	\n\
				4229	Return a list of the words in S, using sep as the\n\
				4230	delimiter string. If maxsplit is given, at most maxsplit\n\
				4231	splits are done. If sep is not specified, any whitespace string\n\
				4232	is a separator.";
				4233
				4234	static PyObject*
				4235	unicode_split(PyUnicodeObject self, PyObject args)
				4236	{
				4237	PyObject *substring = Py_None;
				4238	int maxcount = -1;
				4239
				4240	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4241	return NULL;
				4242
				4243	if (substring == Py_None)
				4244	return split(self, NULL, maxcount);
				4245	else if (PyUnicode_Check(substring))
				4246	return split(self, (PyUnicodeObject *)substring, maxcount);
				4247	else
				4248	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4249	}
				4250
				4251	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4252	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4253	\n\
				4254	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4255	Line breaks are not included in the resulting list unless keepends\n\
				4256	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4257
				4258	static PyObject*
				4259	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4260	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4261	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4262
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4263	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4264	return NULL;
				4265
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4266	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4267	}
				4268
				4269	static
				4270	PyObject unicode_str(PyUnicodeObject self)
				4271	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4272	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4273	}
				4274
				4275	static char strip__doc__[] =
				4276	"S.strip() -> unicode\n\
				4277	\n\
				4278	Return a copy of S with leading and trailing whitespace removed.";
				4279
				4280	static PyObject *
				4281	unicode_strip(PyUnicodeObject self, PyObject args)
				4282	{
				4283	if (!PyArg_NoArgs(args))
				4284	return NULL;
				4285	return strip(self, 1, 1);
				4286	}
				4287
				4288	static char swapcase__doc__[] =
				4289	"S.swapcase() -> unicode\n\
				4290	\n\
				4291	Return a copy of S with uppercase characters converted to lowercase\n\
				4292	and vice versa.";
				4293
				4294	static PyObject*
				4295	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4296	{
				4297	if (!PyArg_NoArgs(args))
				4298	return NULL;
				4299	return fixup(self, fixswapcase);
				4300	}
				4301
				4302	static char translate__doc__[] =
				4303	"S.translate(table) -> unicode\n\
				4304	\n\
				4305	Return a copy of the string S, where all characters have been mapped\n\
				4306	through the given translation table, which must be a mapping of\n\
				4307	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4308	are left untouched. Characters mapped to None are deleted.";
				4309
				4310	static PyObject*
				4311	unicode_translate(PyUnicodeObject self, PyObject args)
				4312	{
				4313	PyObject *table;
				4314
				4315	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4316	return NULL;
				4317	return PyUnicode_TranslateCharmap(self->str,
				4318	self->length,
				4319	table,
				4320	"ignore");
				4321	}
				4322
				4323	static char upper__doc__[] =
				4324	"S.upper() -> unicode\n\
				4325	\n\
				4326	Return a copy of S converted to uppercase.";
				4327
				4328	static PyObject*
				4329	unicode_upper(PyUnicodeObject self, PyObject args)
				4330	{
				4331	if (!PyArg_NoArgs(args))
				4332	return NULL;
				4333	return fixup(self, fixupper);
				4334	}
				4335
				4336	#if 0
				4337	static char zfill__doc__[] =
				4338	"S.zfill(width) -> unicode\n\
				4339	\n\
				4340	Pad a numeric string x with zeros on the left, to fill a field\n\
				4341	of the specified width. The string x is never truncated.";
				4342
				4343	static PyObject *
				4344	unicode_zfill(PyUnicodeObject self, PyObject args)
				4345	{
				4346	int fill;
				4347	PyUnicodeObject *u;
				4348
				4349	int width;
				4350	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4351	return NULL;
				4352
				4353	if (self->length >= width) {
				4354	Py_INCREF(self);
				4355	return (PyObject*) self;
				4356	}
				4357
				4358	fill = width - self->length;
				4359
				4360	u = pad(self, fill, 0, '0');
				4361
				4362	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4363	/* move sign to beginning of string */
				4364	u->str[0] = u->str[fill];
				4365	u->str[fill] = '0';
				4366	}
				4367
				4368	return (PyObject*) u;
				4369	}
				4370	#endif
				4371
				4372	#if 0
				4373	static PyObject*
				4374	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4375	{
				4376	if (!PyArg_NoArgs(args))
				4377	return NULL;
				4378	return PyInt_FromLong(unicode_freelist_size);
				4379	}
				4380	#endif
				4381
				4382	static char startswith__doc__[] =
				4383	"S.startswith(prefix[, start[, end]]) -> int\n\
				4384	\n\
				4385	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4386	optional start, test S beginning at that position. With optional end, stop\n\
				4387	comparing S at that position.";
				4388
				4389	static PyObject *
				4390	unicode_startswith(PyUnicodeObject *self,
				4391	PyObject *args)
				4392	{
				4393	PyUnicodeObject *substring;
				4394	int start = 0;
				4395	int end = INT_MAX;
				4396	PyObject *result;
				4397
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4398	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4399	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4400	return NULL;
				4401	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4402	(PyObject *)substring);
				4403	if (substring == NULL)
				4404	return NULL;
				4405
				4406	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4407
				4408	Py_DECREF(substring);
				4409	return result;
				4410	}
				4411
				4412
				4413	static char endswith__doc__[] =
				4414	"S.endswith(suffix[, start[, end]]) -> int\n\
				4415	\n\
				4416	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4417	optional start, test S beginning at that position. With optional end, stop\n\
				4418	comparing S at that position.";
				4419
				4420	static PyObject *
				4421	unicode_endswith(PyUnicodeObject *self,
				4422	PyObject *args)
				4423	{
				4424	PyUnicodeObject *substring;
				4425	int start = 0;
				4426	int end = INT_MAX;
				4427	PyObject *result;
				4428
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4429	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4430	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4431	return NULL;
				4432	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4433	(PyObject *)substring);
				4434	if (substring == NULL)
				4435	return NULL;
				4436
				4437	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4438
				4439	Py_DECREF(substring);
				4440	return result;
				4441	}
				4442
				4443
				4444	static PyMethodDef unicode_methods[] = {
				4445
				4446	/* Order is according to common usage: often used methods should
				4447	appear first, since lookup is done sequentially. */
				4448
				4449	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4450	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4451	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4452	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4453	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4454	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4455	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4456	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4457	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4458	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4459	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4460	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4461	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4462	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4463	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4464	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4465	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4466	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4467	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4468	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4469	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4470	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4471	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4472	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4473	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4474	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4475	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4476	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4477	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4478	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4479	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4480	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4481	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4482	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4483	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4484	#if 0
				4485	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4486	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4487	#endif
				4488
				4489	#if 0
				4490	/* This one is just used for debugging the implementation. */
				4491	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4492	#endif
				4493
				4494	{NULL, NULL}
				4495	};
				4496
				4497	static PyObject *
				4498	unicode_getattr(PyUnicodeObject self, char name)
				4499	{
				4500	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4501	}
				4502
				4503	static PySequenceMethods unicode_as_sequence = {
				4504	(inquiry) unicode_length, /* sq_length */
				4505	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4506	(intargfunc) unicode_repeat, /* sq_repeat */
				4507	(intargfunc) unicode_getitem, /* sq_item */
				4508	(intintargfunc) unicode_slice, /* sq_slice */
				4509	0, /* sq_ass_item */
				4510	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4511	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4512	};
				4513
				4514	static int
				4515	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4516	int index,
				4517	const void **ptr)
				4518	{
				4519	if (index != 0) {
				4520	PyErr_SetString(PyExc_SystemError,
				4521	"accessing non-existent unicode segment");
				4522	return -1;
				4523	}
				4524	ptr = (void ) self->str;
				4525	return PyUnicode_GET_DATA_SIZE(self);
				4526	}
				4527
				4528	static int
				4529	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4530	const void **ptr)
				4531	{
				4532	PyErr_SetString(PyExc_TypeError,
				4533	"cannot use unicode as modifyable buffer");
				4534	return -1;
				4535	}
				4536
				4537	static int
				4538	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4539	int *lenp)
				4540	{
				4541	if (lenp)
				4542	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4543	return 1;
				4544	}
				4545
				4546	static int
				4547	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4548	int index,
				4549	const void **ptr)
				4550	{
				4551	PyObject *str;
				4552
				4553	if (index != 0) {
				4554	PyErr_SetString(PyExc_SystemError,
				4555	"accessing non-existent unicode segment");
				4556	return -1;
				4557	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4558	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4559	if (str == NULL)
				4560	return -1;
				4561	ptr = (void ) PyString_AS_STRING(str);
				4562	return PyString_GET_SIZE(str);
				4563	}
				4564
				4565	/* Helpers for PyUnicode_Format() */
				4566
				4567	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4568	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4569	{
				4570	int argidx = *p_argidx;
				4571	if (argidx < arglen) {
				4572	(*p_argidx)++;
				4573	if (arglen < 0)
				4574	return args;
				4575	else
				4576	return PyTuple_GetItem(args, argidx);
				4577	}
				4578	PyErr_SetString(PyExc_TypeError,
				4579	"not enough arguments for format string");
				4580	return NULL;
				4581	}
				4582
				4583	#define F_LJUST (1<<0)
				4584	#define F_SIGN (1<<1)
				4585	#define F_BLANK (1<<2)
				4586	#define F_ALT (1<<3)
				4587	#define F_ZERO (1<<4)
				4588
				4589	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4590	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4591	{
				4592	register int i;
				4593	int len;
				4594	va_list va;
				4595	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4596	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4597
				4598	/* First, format the string as char array, then expand to Py_UNICODE
				4599	array. */
				4600	charbuffer = (char *)buffer;
				4601	len = vsprintf(charbuffer, format, va);
				4602	for (i = len - 1; i >= 0; i--)
				4603	buffer[i] = (Py_UNICODE) charbuffer[i];
				4604
				4605	va_end(va);
				4606	return len;
				4607	}
				4608
				4609	static int
				4610	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4611	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4612	int flags,
				4613	int prec,
				4614	int type,
				4615	PyObject *v)
				4616	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4617	/* fmt = '%#.' + `prec` + `type`
				4618	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4619	char fmt[20];
				4620	double x;
				4621
				4622	x = PyFloat_AsDouble(v);
				4623	if (x == -1.0 && PyErr_Occurred())
				4624	return -1;
				4625	if (prec < 0)
				4626	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4627	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4628	type = 'g';
				4629	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4630	/* worst case length calc to ensure no buffer overrun:
				4631	fmt = %#.<prec>g
				4632	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4633	for any double rep.)
				4634	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4635	If prec=0 the effective precision is 1 (the leading digit is
				4636	always given), therefore increase by one to 10+prec. */
				4637	if (buflen <= (size_t)10 + (size_t)prec) {
				4638	PyErr_SetString(PyExc_OverflowError,
				4639	"formatted float is too long (precision too long?)");
				4640	return -1;
				4641	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4642	return usprintf(buf, fmt, x);
				4643	}
				4644
				4645	static int
				4646	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4647	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4648	int flags,
				4649	int prec,
				4650	int type,
				4651	PyObject *v)
				4652	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4653	/* fmt = '%#.' + `prec` + 'l' + `type`
				4654	worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4655	char fmt[20];
				4656	long x;
				4657
				4658	x = PyInt_AsLong(v);
				4659	if (x == -1 && PyErr_Occurred())
				4660	return -1;
				4661	if (prec < 0)
				4662	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4663	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4664	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4665	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4666	PyErr_SetString(PyExc_OverflowError,
				4667	"formatted integer is too long (precision too long?)");
				4668	return -1;
				4669	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4670	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4671	return usprintf(buf, fmt, x);
				4672	}
				4673
				4674	static int
				4675	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4676	size_t buflen,
				4677	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4678	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4679	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4680	if (PyUnicode_Check(v)) {
				4681	if (PyUnicode_GET_SIZE(v) != 1)
				4682	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4683	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4684	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4685
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4686	else if (PyString_Check(v)) {
				4687	if (PyString_GET_SIZE(v) != 1)
				4688	goto onError;
				4689	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4690	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4691
				4692	else {
				4693	/* Integer input truncated to a character */
				4694	long x;
				4695	x = PyInt_AsLong(v);
				4696	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4697	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4698	buf[0] = (char) x;
				4699	}
				4700	buf[1] = '\0';
				4701	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4702
				4703	onError:
				4704	PyErr_SetString(PyExc_TypeError,
				4705	"%c requires int or char");
				4706	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4707	}
				4708
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it stays a single
   operand wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
				4718
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4719	PyObject PyUnicode_Format(PyObject format,
				4720	PyObject *args)
				4721	{
				4722	Py_UNICODE fmt, res;
				4723	int fmtcnt, rescnt, reslen, arglen, argidx;
				4724	int args_owned = 0;
				4725	PyUnicodeObject *result = NULL;
				4726	PyObject *dict = NULL;
				4727	PyObject *uformat;
				4728
				4729	if (format == NULL \|\| args == NULL) {
				4730	PyErr_BadInternalCall();
				4731	return NULL;
				4732	}
				4733	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4734	if (uformat == NULL)
				4735	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4736	fmt = PyUnicode_AS_UNICODE(uformat);
				4737	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4738
				4739	reslen = rescnt = fmtcnt + 100;
				4740	result = _PyUnicode_New(reslen);
				4741	if (result == NULL)
				4742	goto onError;
				4743	res = PyUnicode_AS_UNICODE(result);
				4744
				4745	if (PyTuple_Check(args)) {
				4746	arglen = PyTuple_Size(args);
				4747	argidx = 0;
				4748	}
				4749	else {
				4750	arglen = -1;
				4751	argidx = -2;
				4752	}
				4753	if (args->ob_type->tp_as_mapping)
				4754	dict = args;
				4755
				4756	while (--fmtcnt >= 0) {
				4757	if (*fmt != '%') {
				4758	if (--rescnt < 0) {
				4759	rescnt = fmtcnt + 100;
				4760	reslen += rescnt;
				4761	if (_PyUnicode_Resize(result, reslen) < 0)
				4762	return NULL;
				4763	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4764	--rescnt;
				4765	}
				4766	res++ = fmt++;
				4767	}
				4768	else {
				4769	/* Got a format specifier */
				4770	int flags = 0;
				4771	int width = -1;
				4772	int prec = -1;
				4773	int size = 0;
				4774	Py_UNICODE c = '\0';
				4775	Py_UNICODE fill;
				4776	PyObject *v = NULL;
				4777	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4778	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4779	Py_UNICODE sign;
				4780	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4781	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4782
				4783	fmt++;
				4784	if (*fmt == '(') {
				4785	Py_UNICODE *keystart;
				4786	int keylen;
				4787	PyObject *key;
				4788	int pcount = 1;
				4789
				4790	if (dict == NULL) {
				4791	PyErr_SetString(PyExc_TypeError,
				4792	"format requires a mapping");
				4793	goto onError;
				4794	}
				4795	++fmt;
				4796	--fmtcnt;
				4797	keystart = fmt;
				4798	/* Skip over balanced parentheses */
				4799	while (pcount > 0 && --fmtcnt >= 0) {
				4800	if (*fmt == ')')
				4801	--pcount;
				4802	else if (*fmt == '(')
				4803	++pcount;
				4804	fmt++;
				4805	}
				4806	keylen = fmt - keystart - 1;
				4807	if (fmtcnt < 0 \|\| pcount > 0) {
				4808	PyErr_SetString(PyExc_ValueError,
				4809	"incomplete format key");
				4810	goto onError;
				4811	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4812	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4813	then looked up since Python uses strings to hold
				4814	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4815	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4816	key = PyUnicode_EncodeUTF8(keystart,
				4817	keylen,
				4818	NULL);
				4819	if (key == NULL)
				4820	goto onError;
				4821	if (args_owned) {
				4822	Py_DECREF(args);
				4823	args_owned = 0;
				4824	}
				4825	args = PyObject_GetItem(dict, key);
				4826	Py_DECREF(key);
				4827	if (args == NULL) {
				4828	goto onError;
				4829	}
				4830	args_owned = 1;
				4831	arglen = -1;
				4832	argidx = -2;
				4833	}
				4834	while (--fmtcnt >= 0) {
				4835	switch (c = *fmt++) {
				4836	case '-': flags \|= F_LJUST; continue;
				4837	case '+': flags \|= F_SIGN; continue;
				4838	case ' ': flags \|= F_BLANK; continue;
				4839	case '#': flags \|= F_ALT; continue;
				4840	case '0': flags \|= F_ZERO; continue;
				4841	}
				4842	break;
				4843	}
				4844	if (c == '*') {
				4845	v = getnextarg(args, arglen, &argidx);
				4846	if (v == NULL)
				4847	goto onError;
				4848	if (!PyInt_Check(v)) {
				4849	PyErr_SetString(PyExc_TypeError,
				4850	"* wants int");
				4851	goto onError;
				4852	}
				4853	width = PyInt_AsLong(v);
				4854	if (width < 0) {
				4855	flags \|= F_LJUST;
				4856	width = -width;
				4857	}
				4858	if (--fmtcnt >= 0)
				4859	c = *fmt++;
				4860	}
				4861	else if (c >= '0' && c <= '9') {
				4862	width = c - '0';
				4863	while (--fmtcnt >= 0) {
				4864	c = *fmt++;
				4865	if (c < '0' \|\| c > '9')
				4866	break;
				4867	if ((width*10) / 10 != width) {
				4868	PyErr_SetString(PyExc_ValueError,
				4869	"width too big");
				4870	goto onError;
				4871	}
				4872	width = width*10 + (c - '0');
				4873	}
				4874	}
				4875	if (c == '.') {
				4876	prec = 0;
				4877	if (--fmtcnt >= 0)
				4878	c = *fmt++;
				4879	if (c == '*') {
				4880	v = getnextarg(args, arglen, &argidx);
				4881	if (v == NULL)
				4882	goto onError;
				4883	if (!PyInt_Check(v)) {
				4884	PyErr_SetString(PyExc_TypeError,
				4885	"* wants int");
				4886	goto onError;
				4887	}
				4888	prec = PyInt_AsLong(v);
				4889	if (prec < 0)
				4890	prec = 0;
				4891	if (--fmtcnt >= 0)
				4892	c = *fmt++;
				4893	}
				4894	else if (c >= '0' && c <= '9') {
				4895	prec = c - '0';
				4896	while (--fmtcnt >= 0) {
				4897	c = Py_CHARMASK(*fmt++);
				4898	if (c < '0' \|\| c > '9')
				4899	break;
				4900	if ((prec*10) / 10 != prec) {
				4901	PyErr_SetString(PyExc_ValueError,
				4902	"prec too big");
				4903	goto onError;
				4904	}
				4905	prec = prec*10 + (c - '0');
				4906	}
				4907	}
				4908	} /* prec */
				4909	if (fmtcnt >= 0) {
				4910	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4911	size = c;
				4912	if (--fmtcnt >= 0)
				4913	c = *fmt++;
				4914	}
				4915	}
				4916	if (fmtcnt < 0) {
				4917	PyErr_SetString(PyExc_ValueError,
				4918	"incomplete format");
				4919	goto onError;
				4920	}
				4921	if (c != '%') {
				4922	v = getnextarg(args, arglen, &argidx);
				4923	if (v == NULL)
				4924	goto onError;
				4925	}
				4926	sign = 0;
				4927	fill = ' ';
				4928	switch (c) {
				4929
				4930	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4931	pbuf = formatbuf;
				4932	/* presume that buffer length is at least 1 */
				4933	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4934	len = 1;
				4935	break;
				4936
				4937	case 's':
				4938	case 'r':
				4939	if (PyUnicode_Check(v) && c == 's') {
				4940	temp = v;
				4941	Py_INCREF(temp);
				4942	}
				4943	else {
				4944	PyObject *unicode;
				4945	if (c == 's')
				4946	temp = PyObject_Str(v);
				4947	else
				4948	temp = PyObject_Repr(v);
				4949	if (temp == NULL)
				4950	goto onError;
				4951	if (!PyString_Check(temp)) {
				4952	/* XXX Note: this should never happen, since
				4953	PyObject_Repr() and PyObject_Str() assure
				4954	this */
				4955	Py_DECREF(temp);
				4956	PyErr_SetString(PyExc_TypeError,
				4957	"%s argument has non-string str()");
				4958	goto onError;
				4959	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4960	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4961	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4962	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4963	"strict");
				4964	Py_DECREF(temp);
				4965	temp = unicode;
				4966	if (temp == NULL)
				4967	goto onError;
				4968	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4969	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4970	len = PyUnicode_GET_SIZE(temp);
				4971	if (prec >= 0 && len > prec)
				4972	len = prec;
				4973	break;
				4974
				4975	case 'i':
				4976	case 'd':
				4977	case 'u':
				4978	case 'o':
				4979	case 'x':
				4980	case 'X':
				4981	if (c == 'i')
				4982	c = 'd';
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4983	pbuf = formatbuf;
				4984	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4985	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4986	if (len < 0)
				4987	goto onError;
				4988	sign = (c == 'd');
				4989	if (flags & F_ZERO) {
				4990	fill = '0';
				4991	if ((flags&F_ALT) &&
				4992	(c == 'x' \|\| c == 'X') &&
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4993	pbuf[0] == '0' && pbuf[1] == c) {
				4994	res++ = pbuf++;
				4995	res++ = pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4996	rescnt -= 2;
				4997	len -= 2;
				4998	width -= 2;
				4999	if (width < 0)
				5000	width = 0;
				5001	}
				5002	}
				5003	break;
				5004
				5005	case 'e':
				5006	case 'E':
				5007	case 'f':
				5008	case 'g':
				5009	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5010	pbuf = formatbuf;
				5011	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5012	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5013	if (len < 0)
				5014	goto onError;
				5015	sign = 1;
				5016	if (flags&F_ZERO)
				5017	fill = '0';
				5018	break;
				5019
				5020	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5021	pbuf = formatbuf;
				5022	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5023	if (len < 0)
				5024	goto onError;
				5025	break;
				5026
				5027	default:
				5028	PyErr_Format(PyExc_ValueError,
				5029	"unsupported format character '%c' (0x%x)",
				5030	c, c);
				5031	goto onError;
				5032	}
				5033	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5034	if (pbuf == '-' \|\| pbuf == '+') {
				5035	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5036	len--;
				5037	}
				5038	else if (flags & F_SIGN)
				5039	sign = '+';
				5040	else if (flags & F_BLANK)
				5041	sign = ' ';
				5042	else
				5043	sign = 0;
				5044	}
				5045	if (width < len)
				5046	width = len;
				5047	if (rescnt < width + (sign != 0)) {
				5048	reslen -= rescnt;
				5049	rescnt = width + fmtcnt + 100;
				5050	reslen += rescnt;
				5051	if (_PyUnicode_Resize(result, reslen) < 0)
				5052	return NULL;
				5053	res = PyUnicode_AS_UNICODE(result)
				5054	+ reslen - rescnt;
				5055	}
				5056	if (sign) {
				5057	if (fill != ' ')
				5058	*res++ = sign;
				5059	rescnt--;
				5060	if (width > len)
				5061	width--;
				5062	}
				5063	if (width > len && !(flags & F_LJUST)) {
				5064	do {
				5065	--rescnt;
				5066	*res++ = fill;
				5067	} while (--width > len);
				5068	}
				5069	if (sign && fill == ' ')
				5070	*res++ = sign;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5071	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5072	res += len;
				5073	rescnt -= len;
				5074	while (--width >= len) {
				5075	--rescnt;
				5076	*res++ = ' ';
				5077	}
				5078	if (dict && (argidx < arglen) && c != '%') {
				5079	PyErr_SetString(PyExc_TypeError,
				5080	"not all arguments converted");
				5081	goto onError;
				5082	}
				5083	Py_XDECREF(temp);
				5084	} /* '%' */
				5085	} /* until end */
				5086	if (argidx < arglen && !dict) {
				5087	PyErr_SetString(PyExc_TypeError,
				5088	"not all arguments converted");
				5089	goto onError;
				5090	}
				5091
				5092	if (args_owned) {
				5093	Py_DECREF(args);
				5094	}
				5095	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5096	if (_PyUnicode_Resize(result, reslen - rescnt))
				5097	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5098	return (PyObject *)result;
				5099
				5100	onError:
				5101	Py_XDECREF(result);
				5102	Py_DECREF(uformat);
				5103	if (args_owned) {
				5104	Py_DECREF(args);
				5105	}
				5106	return NULL;
				5107	}
				5108
				5109	static PyBufferProcs unicode_as_buffer = {
				5110	(getreadbufferproc) unicode_buffer_getreadbuf,
				5111	(getwritebufferproc) unicode_buffer_getwritebuf,
				5112	(getsegcountproc) unicode_buffer_getsegcount,
				5113	(getcharbufferproc) unicode_buffer_getcharbuf,
				5114	};
				5115
				5116	PyTypeObject PyUnicode_Type = {
				5117	PyObject_HEAD_INIT(&PyType_Type)
				5118	0, /* ob_size */
				5119	"unicode", /* tp_name */
				5120	sizeof(PyUnicodeObject), /* tp_size */
				5121	0, /* tp_itemsize */
				5122	/* Slots */
				5123	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5124	0, /* tp_print */
				5125	(getattrfunc)unicode_getattr, /* tp_getattr */
				5126	0, /* tp_setattr */
				5127	(cmpfunc) unicode_compare, /* tp_compare */
				5128	(reprfunc) unicode_repr, /* tp_repr */
				5129	0, /* tp_as_number */
				5130	&unicode_as_sequence, /* tp_as_sequence */
				5131	0, /* tp_as_mapping */
				5132	(hashfunc) unicode_hash, /* tp_hash*/
				5133	0, /* tp_call*/
				5134	(reprfunc) unicode_str, /* tp_str */
				5135	(getattrofunc) NULL, /* tp_getattro */
				5136	(setattrofunc) NULL, /* tp_setattro */
				5137	&unicode_as_buffer, /* tp_as_buffer */
				5138	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5139	};
				5140
				5141	/* Initialize the Unicode implementation */
				5142
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5143	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5144	{
				5145	/* Doublecheck the configuration... */
				5146	if (sizeof(Py_UNICODE) != 2)
				5147	Py_FatalError("Unicode configuration error: "
				5148	"sizeof(Py_UNICODE) != 2 bytes");
				5149
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5150	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5151	unicode_freelist = NULL;
				5152	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5153	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5154	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5155	}
				5156
				5157	/* Finalize the Unicode implementation */
				5158
				5159	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5160	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5161	{
				5162	PyUnicodeObject *u = unicode_freelist;
				5163
				5164	while (u != NULL) {
				5165	PyUnicodeObject *v = u;
				5166	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5167	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5168	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5169	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5170	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5171	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5172	unicode_freelist = NULL;
				5173	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5174	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5175	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5176	}