Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: f4dc9bfe7e60ea822edd5a1f8a2fb7c97f192ab0 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	111	/* --- Globals ------------------------------------------------------------
				112
				113	The globals are initialized by the _PyUnicode_Init() API and should
				114	not be used before calling that API.
				115
				116	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	117
				118	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	119	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	120
				121	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	122	static PyUnicodeObject *unicode_freelist;
				123	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	124
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	125	/* Default encoding to use and assume when NULL is passed as encoding
				126	parameter; it is initialized by _PyUnicode_Init().
				127
				128	Always use the PyUnicode_SetDefaultEncoding() and
				129	PyUnicode_GetDefaultEncoding() APIs to access this global.
				130
				131	*/
				132
				133	static char unicode_default_encoding[100];
				134
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* --- Unicode Object ----------------------------------------------------- */
				136
				137	static
				138	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				139	int length)
				140	{
				141	void *oldstr;
				142
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	143	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	144	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	146
				147	/* Resizing unicode_empty is not allowed. */
				148	if (unicode == unicode_empty) {
				149	PyErr_SetString(PyExc_SystemError,
				150	"can't resize empty unicode object");
				151	return -1;
				152	}
				153
				154	/* We allocate one more byte to make sure the string is
				155	Ux0000 terminated -- XXX is this needed ? */
				156	oldstr = unicode->str;
				157	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				158	if (!unicode->str) {
				159	unicode->str = oldstr;
				160	PyErr_NoMemory();
				161	return -1;
				162	}
				163	unicode->str[length] = 0;
				164	unicode->length = length;
				165
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	166	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	167	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	168	if (unicode->defenc) {
				169	Py_DECREF(unicode->defenc);
				170	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	171	}
				172	unicode->hash = -1;
				173
				174	return 0;
				175	}
				176
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	177	int PyUnicode_Resize(PyObject **unicode,
				178	int length)
				179	{
				180	PyUnicodeObject *v;
				181
				182	if (unicode == NULL) {
				183	PyErr_BadInternalCall();
				184	return -1;
				185	}
				186	v = (PyUnicodeObject )unicode;
				187	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				188	PyErr_BadInternalCall();
				189	return -1;
				190	}
				191	return _PyUnicode_Resize(v, length);
				192	}
				193
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	194	/* We allocate one more byte to make sure the string is
				195	Ux0000 terminated -- XXX is this needed ?
				196
				197	XXX This allocator could further be enhanced by assuring that the
				198	free list never reduces its size below 1.
				199
				200	*/
				201
				202	static
				203	PyUnicodeObject *_PyUnicode_New(int length)
				204	{
				205	register PyUnicodeObject *unicode;
				206
				207	/* Optimization for empty strings */
				208	if (length == 0 && unicode_empty != NULL) {
				209	Py_INCREF(unicode_empty);
				210	return unicode_empty;
				211	}
				212
				213	/* Unicode freelist & memory allocation */
				214	if (unicode_freelist) {
				215	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	216	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	217	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	219	/* Keep-Alive optimization: we only upsize the buffer,
				220	never downsize it. */
				221	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	223	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	224	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	227	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	229	}
				230	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	231	}
				232	else {
				233	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				234	if (unicode == NULL)
				235	return NULL;
				236	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				237	}
				238
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	239	if (!unicode->str) {
				240	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	241	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	242	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	243	unicode->str[length] = 0;
				244	unicode->length = length;
				245	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	246	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	248
				249	onError:
				250	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	251	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	252	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	253	}
				254
				255	static
				256	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				257	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	258	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	259	/* Keep-Alive optimization */
				260	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	261	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	262	unicode->str = NULL;
				263	unicode->length = 0;
				264	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	265	if (unicode->defenc) {
				266	Py_DECREF(unicode->defenc);
				267	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	268	}
				269	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	270	(PyUnicodeObject *)unicode = unicode_freelist;
				271	unicode_freelist = unicode;
				272	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	273	}
				274	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	275	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	276	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	277	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	278	}
				279	}
				280
				281	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				282	int size)
				283	{
				284	PyUnicodeObject *unicode;
				285
				286	unicode = _PyUnicode_New(size);
				287	if (!unicode)
				288	return NULL;
				289
				290	/* Copy the Unicode data into the new object */
				291	if (u != NULL)
				292	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	#ifdef HAVE_WCHAR_H
				298
				299	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				300	int size)
				301	{
				302	PyUnicodeObject *unicode;
				303
				304	if (w == NULL) {
				305	PyErr_BadInternalCall();
				306	return NULL;
				307	}
				308
				309	unicode = _PyUnicode_New(size);
				310	if (!unicode)
				311	return NULL;
				312
				313	/* Copy the wchar_t data into the new object */
				314	#ifdef HAVE_USABLE_WCHAR_T
				315	memcpy(unicode->str, w, size * sizeof(wchar_t));
				316	#else
				317	{
				318	register Py_UNICODE *u;
				319	register int i;
				320	u = PyUnicode_AS_UNICODE(unicode);
				321	for (i = size; i >= 0; i--)
				322	u++ = w++;
				323	}
				324	#endif
				325
				326	return (PyObject *)unicode;
				327	}
				328
				329	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				330	register wchar_t *w,
				331	int size)
				332	{
				333	if (unicode == NULL) {
				334	PyErr_BadInternalCall();
				335	return -1;
				336	}
				337	if (size > PyUnicode_GET_SIZE(unicode))
				338	size = PyUnicode_GET_SIZE(unicode);
				339	#ifdef HAVE_USABLE_WCHAR_T
				340	memcpy(w, unicode->str, size * sizeof(wchar_t));
				341	#else
				342	{
				343	register Py_UNICODE *u;
				344	register int i;
				345	u = PyUnicode_AS_UNICODE(unicode);
				346	for (i = size; i >= 0; i--)
				347	w++ = u++;
				348	}
				349	#endif
				350
				351	return size;
				352	}
				353
				354	#endif
				355
				356	PyObject PyUnicode_FromObject(register PyObject obj)
				357	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	358	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				359	}
				360
				361	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				362	const char *encoding,
				363	const char *errors)
				364	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	365	const char *s;
				366	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	367	int owned = 0;
				368	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	369
				370	if (obj == NULL) {
				371	PyErr_BadInternalCall();
				372	return NULL;
				373	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	374
				375	/* Coerce object */
				376	if (PyInstance_Check(obj)) {
				377	PyObject *func;
				378	func = PyObject_GetAttrString(obj, "__str__");
				379	if (func == NULL) {
				380	PyErr_SetString(PyExc_TypeError,
				381	"coercing to Unicode: instance doesn't define __str__");
				382	return NULL;
				383	}
				384	obj = PyEval_CallObject(func, NULL);
				385	Py_DECREF(func);
				386	if (obj == NULL)
				387	return NULL;
				388	owned = 1;
				389	}
				390	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	391	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	392	v = obj;
				393	if (encoding) {
				394	PyErr_SetString(PyExc_TypeError,
				395	"decoding Unicode is not supported");
				396	return NULL;
				397	}
				398	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	399	}
				400	else if (PyString_Check(obj)) {
				401	s = PyString_AS_STRING(obj);
				402	len = PyString_GET_SIZE(obj);
				403	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	404	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				405	/* Overwrite the error message with something more useful in
				406	case of a TypeError. */
				407	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	408	PyErr_Format(PyExc_TypeError,
				409	"coercing to Unicode: need string or buffer, "
				410	"%.80s found",
				411	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414
				415	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	if (len == 0) {
				417	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	418	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	419	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	420	else
				421	v = PyUnicode_Decode(s, len, encoding, errors);
				422	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	423	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	424	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426	return v;
				427
				428	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	429	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	430	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	431	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	432	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	433	}
				434
				435	PyObject PyUnicode_Decode(const char s,
				436	int size,
				437	const char *encoding,
				438	const char *errors)
				439	{
				440	PyObject buffer = NULL, unicode;
				441
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	442	if (encoding == NULL)
				443	encoding = PyUnicode_GetDefaultEncoding();
				444
				445	/* Shortcuts for common default encodings */
				446	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	447	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	448	else if (strcmp(encoding, "latin-1") == 0)
				449	return PyUnicode_DecodeLatin1(s, size, errors);
				450	else if (strcmp(encoding, "ascii") == 0)
				451	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452
				453	/* Decode via the codec registry */
				454	buffer = PyBuffer_FromMemory((void *)s, size);
				455	if (buffer == NULL)
				456	goto onError;
				457	unicode = PyCodec_Decode(buffer, encoding, errors);
				458	if (unicode == NULL)
				459	goto onError;
				460	if (!PyUnicode_Check(unicode)) {
				461	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	462	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	463	unicode->ob_type->tp_name);
				464	Py_DECREF(unicode);
				465	goto onError;
				466	}
				467	Py_DECREF(buffer);
				468	return unicode;
				469
				470	onError:
				471	Py_XDECREF(buffer);
				472	return NULL;
				473	}
				474
				475	PyObject PyUnicode_Encode(const Py_UNICODE s,
				476	int size,
				477	const char *encoding,
				478	const char *errors)
				479	{
				480	PyObject v, unicode;
				481
				482	unicode = PyUnicode_FromUnicode(s, size);
				483	if (unicode == NULL)
				484	return NULL;
				485	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				486	Py_DECREF(unicode);
				487	return v;
				488	}
				489
				490	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				491	const char *encoding,
				492	const char *errors)
				493	{
				494	PyObject *v;
				495
				496	if (!PyUnicode_Check(unicode)) {
				497	PyErr_BadArgument();
				498	goto onError;
				499	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	500
				501	if (encoding == NULL)
				502	encoding = PyUnicode_GetDefaultEncoding();
				503
				504	/* Shortcuts for common default encodings */
				505	if (errors == NULL) {
				506	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	507	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	508	else if (strcmp(encoding, "latin-1") == 0)
				509	return PyUnicode_AsLatin1String(unicode);
				510	else if (strcmp(encoding, "ascii") == 0)
				511	return PyUnicode_AsASCIIString(unicode);
				512	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	513
				514	/* Encode via the codec registry */
				515	v = PyCodec_Encode(unicode, encoding, errors);
				516	if (v == NULL)
				517	goto onError;
				518	/* XXX Should we really enforce this ? */
				519	if (!PyString_Check(v)) {
				520	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	521	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	522	v->ob_type->tp_name);
				523	Py_DECREF(v);
				524	goto onError;
				525	}
				526	return v;
				527
				528	onError:
				529	return NULL;
				530	}
				531
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	532	/* Return a Python string holding the default encoded value of the
				533	Unicode object.
				534
				535	The resulting string is cached in the Unicode object for subsequent
				536	usage by this function. The cached version is needed to implement
				537	the character buffer interface and will live (at least) as long as
				538	the Unicode object itself.
				539
				540	The refcount of the string is not incremented.
				541
				542	* Exported for internal use by the interpreter only !!! *
				543
				544	*/
				545
				546	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				547	const char *errors)
				548	{
				549	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				550
				551	if (v)
				552	return v;
				553	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				554	if (v && errors == NULL)
				555	((PyUnicodeObject *)unicode)->defenc = v;
				556	return v;
				557	}
				558
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				560	{
				561	if (!PyUnicode_Check(unicode)) {
				562	PyErr_BadArgument();
				563	goto onError;
				564	}
				565	return PyUnicode_AS_UNICODE(unicode);
				566
				567	onError:
				568	return NULL;
				569	}
				570
				571	int PyUnicode_GetSize(PyObject *unicode)
				572	{
				573	if (!PyUnicode_Check(unicode)) {
				574	PyErr_BadArgument();
				575	goto onError;
				576	}
				577	return PyUnicode_GET_SIZE(unicode);
				578
				579	onError:
				580	return -1;
				581	}
				582
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	583	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	584	{
				585	return unicode_default_encoding;
				586	}
				587
				588	int PyUnicode_SetDefaultEncoding(const char *encoding)
				589	{
				590	PyObject *v;
				591
				592	/* Make sure the encoding is valid. As side effect, this also
				593	loads the encoding into the codec registry cache. */
				594	v = _PyCodec_Lookup(encoding);
				595	if (v == NULL)
				596	goto onError;
				597	Py_DECREF(v);
				598	strncpy(unicode_default_encoding,
				599	encoding,
				600	sizeof(unicode_default_encoding));
				601	return 0;
				602
				603	onError:
				604	return -1;
				605	}
				606
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	607	/* --- UTF-8 Codec -------------------------------------------------------- */
				608
				609	static
				610	char utf8_code_length[256] = {
				611	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				612	illegal prefix. see RFC 2279 for details */
				613	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				614	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				615	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				616	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				617	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				618	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				619	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				620	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				621	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				622	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				623	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				624	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				625	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				626	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				627	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				628	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				629	};
				630
				631	static
				632	int utf8_decoding_error(const char **source,
				633	Py_UNICODE **dest,
				634	const char *errors,
				635	const char *details)
				636	{
				637	if ((errors == NULL) \|\|
				638	(strcmp(errors,"strict") == 0)) {
				639	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	640	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	641	details);
				642	return -1;
				643	}
				644	else if (strcmp(errors,"ignore") == 0) {
				645	(*source)++;
				646	return 0;
				647	}
				648	else if (strcmp(errors,"replace") == 0) {
				649	(*source)++;
				650	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				651	(*dest)++;
				652	return 0;
				653	}
				654	else {
				655	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	656	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	657	errors);
				658	return -1;
				659	}
				660	}
				661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	662	PyObject PyUnicode_DecodeUTF8(const char s,
				663	int size,
				664	const char *errors)
				665	{
				666	int n;
				667	const char *e;
				668	PyUnicodeObject *unicode;
				669	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	670	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	671
				672	/* Note: size will always be longer than the resulting Unicode
				673	character count */
				674	unicode = _PyUnicode_New(size);
				675	if (!unicode)
				676	return NULL;
				677	if (size == 0)
				678	return (PyObject *)unicode;
				679
				680	/* Unpack UTF-8 encoded data */
				681	p = unicode->str;
				682	e = s + size;
				683
				684	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	685	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686
				687	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	688	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	689	s++;
				690	continue;
				691	}
				692
				693	n = utf8_code_length[ch];
				694
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	695	if (s + n > e) {
				696	errmsg = "unexpected end of data";
				697	goto utf8Error;
				698	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699
				700	switch (n) {
				701
				702	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	703	errmsg = "unexpected code byte";
				704	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	705	break;
				706
				707	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	708	errmsg = "internal error";
				709	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	710	break;
				711
				712	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	if ((s[1] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	717	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	718	if (ch < 0x80) {
				719	errmsg = "illegal encoding";
				720	goto utf8Error;
				721	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	722	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	723	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	724	break;
				725
				726	case 3:
				727	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	728	(s[2] & 0xc0) != 0x80) {
				729	errmsg = "invalid data";
				730	goto utf8Error;
				731	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	732	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	733	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				734	errmsg = "illegal encoding";
				735	goto utf8Error;
				736	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	738	*p++ = (Py_UNICODE)ch;
				739	break;
				740
				741	case 4:
				742	if ((s[1] & 0xc0) != 0x80 \|\|
				743	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	(s[3] & 0xc0) != 0x80) {
				745	errmsg = "invalid data";
				746	goto utf8Error;
				747	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	748	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				749	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				750	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	751	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				752	byte encoding */
				753	(ch > 0x10ffff)) { /* maximum value allowed for
				754	UTF-16 */
				755	errmsg = "illegal encoding";
				756	goto utf8Error;
				757	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	758	/* compute and append the two surrogates: */
				759
				760	/* translate from 10000..10FFFF to 0..FFFF */
				761	ch -= 0x10000;
				762
				763	/* high surrogate = top 10 bits added to D800 */
				764	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				765
				766	/* low surrogate = bottom 10 bits added to DC00 */
				767	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	768	break;
				769
				770	default:
				771	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	772	errmsg = "unsupported Unicode code range";
				773	goto utf8Error;
				774	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	775	}
				776	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	777	continue;
				778
				779	utf8Error:
				780	if (utf8_decoding_error(&s, &p, errors, errmsg))
				781	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	782	}
				783
				784	/* Adjust length */
				785	if (_PyUnicode_Resize(unicode, p - unicode->str))
				786	goto onError;
				787
				788	return (PyObject *)unicode;
				789
				790	onError:
				791	Py_DECREF(unicode);
				792	return NULL;
				793	}
				794
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding failure.

   errors == NULL or "strict": set UnicodeError, return -1.
   "ignore":                   do nothing, return 0.
   "replace":                  store '?' at **dest, advance *dest, return 0.
   anything else:              set ValueError, return -1.

   `source` is accepted but not used by this helper. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        /* Write the substitute and step the output cursor past it. */
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	828
				829	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				830	int size,
				831	const char *errors)
				832	{
				833	PyObject *v;
				834	char *p;
				835	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	836	Py_UCS4 ch2;
				837	unsigned int cbAllocated = 3 * size;
				838	unsigned int cbWritten = 0;
				839	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	842	if (v == NULL)
				843	return NULL;
				844	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame^]	845	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	846
				847	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	while (i < size) {
				849	Py_UCS4 ch = s[i++];
				850	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	851	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	852	cbWritten++;
				853	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	854	else if (ch < 0x0800) {
				855	*p++ = 0xc0 \| (ch >> 6);
				856	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	857	cbWritten += 2;
				858	}
				859	else {
				860	/* Check for high surrogate */
				861	if (0xD800 <= ch && ch <= 0xDBFF) {
				862	if (i != size) {
				863	ch2 = s[i];
				864	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				865
				866	if (cbWritten >= (cbAllocated - 4)) {
				867	/* Provide enough room for some more
				868	surrogates */
				869	cbAllocated += 4*10;
				870	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	871	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	872	}
				873
				874	/* combine the two values */
				875	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				876
				877	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	878	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	879	i++;
				880	cbWritten += 4;
				881	}
				882	}
				883	}
				884	else {
				885	*p++ = (char)(0xe0 \| (ch >> 12));
				886	cbWritten += 3;
				887	}
				888	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				889	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	}
				891	}
				892	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	893	if (_PyString_Resize(&v, p - q))
				894	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	895	return v;
				896
				897	onError:
				898	Py_DECREF(v);
				899	return NULL;
				900	}
				901
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	902	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				903	{
				904	PyObject *str;
				905
				906	if (!PyUnicode_Check(unicode)) {
				907	PyErr_BadArgument();
				908	return NULL;
				909	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	910	str = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				911	PyUnicode_GET_SIZE(unicode),
				912	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	913	if (str == NULL)
				914	return NULL;
				915	Py_INCREF(str);
				916	return str;
				917	}
				918
				919	/* --- UTF-16 Codec ------------------------------------------------------- */
				920
				921	static
				922	int utf16_decoding_error(const Py_UNICODE **source,
				923	Py_UNICODE **dest,
				924	const char *errors,
				925	const char *details)
				926	{
				927	if ((errors == NULL) \|\|
				928	(strcmp(errors,"strict") == 0)) {
				929	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	930	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	931	details);
				932	return -1;
				933	}
				934	else if (strcmp(errors,"ignore") == 0) {
				935	return 0;
				936	}
				937	else if (strcmp(errors,"replace") == 0) {
				938	if (dest) {
				939	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				940	(*dest)++;
				941	}
				942	return 0;
				943	}
				944	else {
				945	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	946	"UTF-16 decoding error; "
				947	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	948	errors);
				949	return -1;
				950	}
				951	}
				952
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	953	PyObject PyUnicode_DecodeUTF16(const char s,
				954	int size,
				955	const char *errors,
				956	int *byteorder)
				957	{
				958	PyUnicodeObject *unicode;
				959	Py_UNICODE *p;
				960	const Py_UNICODE q, e;
				961	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	962	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	963
				964	/* size should be an even number */
				965	if (size % sizeof(Py_UNICODE) != 0) {
				966	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				967	return NULL;
				968	/* The remaining input chars are ignored if we fall through
				969	here... */
				970	}
				971
				972	/* Note: size will always be longer than the resulting Unicode
				973	character count */
				974	unicode = _PyUnicode_New(size);
				975	if (!unicode)
				976	return NULL;
				977	if (size == 0)
				978	return (PyObject *)unicode;
				979
				980	/* Unpack UTF-16 encoded data */
				981	p = unicode->str;
				982	q = (Py_UNICODE *)s;
				983	e = q + (size / sizeof(Py_UNICODE));
				984
				985	if (byteorder)
				986	bo = *byteorder;
				987
				988	while (q < e) {
				989	register Py_UNICODE ch = *q++;
				990
				991	/* Check for BOM marks (U+FEFF) in the input and adjust
				992	current byte order setting accordingly. Swap input
				993	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				994	!) */
				995	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				996	if (ch == 0xFEFF) {
				997	bo = -1;
				998	continue;
				999	} else if (ch == 0xFFFE) {
				1000	bo = 1;
				1001	continue;
				1002	}
				1003	if (bo == 1)
				1004	ch = (ch >> 8) \| (ch << 8);
				1005	#else
				1006	if (ch == 0xFEFF) {
				1007	bo = 1;
				1008	continue;
				1009	} else if (ch == 0xFFFE) {
				1010	bo = -1;
				1011	continue;
				1012	}
				1013	if (bo == -1)
				1014	ch = (ch >> 8) \| (ch << 8);
				1015	#endif
				1016	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1017	*p++ = ch;
				1018	continue;
				1019	}
				1020
				1021	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1022	if (q >= e) {
				1023	errmsg = "unexpected end of data";
				1024	goto utf16Error;
				1025	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1026	if (0xDC00 <= q && q <= 0xDFFF) {
				1027	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1028	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1029	/* This is valid data (a UTF-16 surrogate pair), but
				1030	we are not able to store this information since our
				1031	Py_UNICODE type only has 16 bits... this might
				1032	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1033	errmsg = "code pairs are not supported";
				1034	goto utf16Error;
				1035	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1036	else
				1037	continue;
				1038	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1039	errmsg = "illegal encoding";
				1040	/* Fall through to report the error */
				1041
				1042	utf16Error:
				1043	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1044	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1045	}
				1046
				1047	if (byteorder)
				1048	*byteorder = bo;
				1049
				1050	/* Adjust length */
				1051	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1052	goto onError;
				1053
				1054	return (PyObject *)unicode;
				1055
				1056	onError:
				1057	Py_DECREF(unicode);
				1058	return NULL;
				1059	}
				1060
				1061	#undef UTF16_ERROR
				1062
				1063	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1064	int size,
				1065	const char *errors,
				1066	int byteorder)
				1067	{
				1068	PyObject *v;
				1069	Py_UNICODE *p;
				1070	char *q;
				1071
				1072	/* We don't create UTF-16 pairs... */
				1073	v = PyString_FromStringAndSize(NULL,
				1074	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1075	if (v == NULL)
				1076	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1077
				1078	q = PyString_AS_STRING(v);
				1079	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1080	if (byteorder == 0)
				1081	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1082	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame^]	1083	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1084	if (byteorder == 0 \|\|
				1085	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1086	byteorder == -1
				1087	#else
				1088	byteorder == 1
				1089	#endif
				1090	)
				1091	memcpy(p, s, size * sizeof(Py_UNICODE));
				1092	else
				1093	while (size-- > 0) {
				1094	Py_UNICODE ch = *s++;
				1095	*p++ = (ch >> 8) \| (ch << 8);
				1096	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1097	return v;
				1098	}
				1099
				1100	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1101	{
				1102	if (!PyUnicode_Check(unicode)) {
				1103	PyErr_BadArgument();
				1104	return NULL;
				1105	}
				1106	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1107	PyUnicode_GET_SIZE(unicode),
				1108	NULL,
				1109	0);
				1110	}
				1111
				1112	/* --- Unicode Escape Codec ----------------------------------------------- */
				1113
				1114	static
				1115	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1116	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1117	const char *errors,
				1118	const char *details)
				1119	{
				1120	if ((errors == NULL) \|\|
				1121	(strcmp(errors,"strict") == 0)) {
				1122	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1123	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1124	details);
				1125	return -1;
				1126	}
				1127	else if (strcmp(errors,"ignore") == 0) {
				1128	return 0;
				1129	}
				1130	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1131	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1132	return 0;
				1133	}
				1134	else {
				1135	PyErr_Format(PyExc_ValueError,
				1136	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1137	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1138	errors);
				1139	return -1;
				1140	}
				1141	}
				1142
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1143	static _Py_UCNHashAPI *pucnHash = NULL;
				1144
				1145	static
				1146	int mystrnicmp(const char s1, const char s2, size_t count)
				1147	{
				1148	char c1, c2;
				1149
				1150	if (count)
				1151	{
				1152	do
				1153	{
				1154	c1 = tolower(*(s1++));
				1155	c2 = tolower(*(s2++));
				1156	}
				1157	while(--count && c1 == c2);
				1158
				1159	return c1 - c2;
				1160	}
				1161
				1162	return 0;
				1163	}
				1164
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1165	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1166	int size,
				1167	const char *errors)
				1168	{
				1169	PyUnicodeObject *v;
				1170	Py_UNICODE p = NULL, buf = NULL;
				1171	const char *end;
				1172
				1173	/* Escaped strings will always be longer than the resulting
				1174	Unicode string, so we start with size here and then reduce the
				1175	length after conversion to the true value. */
				1176	v = _PyUnicode_New(size);
				1177	if (v == NULL)
				1178	goto onError;
				1179	if (size == 0)
				1180	return (PyObject *)v;
				1181	p = buf = PyUnicode_AS_UNICODE(v);
				1182	end = s + size;
				1183	while (s < end) {
				1184	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1185	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1186	int i;
				1187
				1188	/* Non-escape characters are interpreted as Unicode ordinals */
				1189	if (*s != '\\') {
				1190	p++ = (unsigned char)s++;
				1191	continue;
				1192	}
				1193
				1194	/* \ - Escapes */
				1195	s++;
				1196	switch (*s++) {
				1197
				1198	/* \x escapes */
				1199	case '\n': break;
				1200	case '\\': *p++ = '\\'; break;
				1201	case '\'': *p++ = '\''; break;
				1202	case '\"': *p++ = '\"'; break;
				1203	case 'b': *p++ = '\b'; break;
				1204	case 'f': p++ = '\014'; break; / FF */
				1205	case 't': *p++ = '\t'; break;
				1206	case 'n': *p++ = '\n'; break;
				1207	case 'r': *p++ = '\r'; break;
				1208	case 'v': p++ = '\013'; break; / VT */
				1209	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1210
				1211	/* \OOO (octal) escapes */
				1212	case '0': case '1': case '2': case '3':
				1213	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1214	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1215	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1216	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1217	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1218	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1219	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1220	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1221	break;
				1222
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1223	/* \xXXXX escape with 1-n hex digits. for compatibility
				1224	with 8-bit strings, this code ignores all but the last
				1225	two digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1226	case 'x':
				1227	x = 0;
				1228	c = (unsigned char)*s;
				1229	if (isxdigit(c)) {
				1230	do {
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1231	x = (x<<4) & 0xF0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1232	if ('0' <= c && c <= '9')
				1233	x += c - '0';
				1234	else if ('a' <= c && c <= 'f')
				1235	x += 10 + c - 'a';
				1236	else
				1237	x += 10 + c - 'A';
				1238	c = (unsigned char)*++s;
				1239	} while (isxdigit(c));
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1240	*p++ = (unsigned char) x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1241	} else {
				1242	*p++ = '\\';
				1243	*p++ = (unsigned char)s[-1];
				1244	}
				1245	break;
				1246
				1247	/* \uXXXX with 4 hex digits */
				1248	case 'u':
				1249	for (x = 0, i = 0; i < 4; i++) {
				1250	c = (unsigned char)s[i];
				1251	if (!isxdigit(c)) {
				1252	if (unicodeescape_decoding_error(&s, &x, errors,
				1253	"truncated \\uXXXX"))
				1254	goto onError;
				1255	i++;
				1256	break;
				1257	}
				1258	x = (x<<4) & ~0xF;
				1259	if (c >= '0' && c <= '9')
				1260	x += c - '0';
				1261	else if (c >= 'a' && c <= 'f')
				1262	x += 10 + c - 'a';
				1263	else
				1264	x += 10 + c - 'A';
				1265	}
				1266	s += i;
				1267	*p++ = x;
				1268	break;
				1269
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1270	case 'N':
				1271	/* Ok, we need to deal with Unicode Character Names now,
				1272	* make sure we've imported the hash table data...
				1273	*/
				1274	if (pucnHash == NULL)
				1275	{
				1276	PyObject mod = 0, v = 0;
				1277
				1278	mod = PyImport_ImportModule("ucnhash");
				1279	if (mod == NULL)
				1280	goto onError;
				1281	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1282	Py_DECREF(mod);
				1283	if (v == NULL)
				1284	{
				1285	goto onError;
				1286	}
				1287	pucnHash = PyCObject_AsVoidPtr(v);
				1288	Py_DECREF(v);
				1289	if (pucnHash == NULL)
				1290	{
				1291	goto onError;
				1292	}
				1293	}
				1294
				1295	if (*s == '{')
				1296	{
				1297	const char *start = s + 1;
				1298	const char *endBrace = start;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1299	Py_UCS4 value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1300	unsigned long j;
				1301
				1302	/* look for either the closing brace, or we
				1303	* exceed the maximum length of the unicode character names
				1304	*/
				1305	while (*endBrace != '}' &&
				1306	(unsigned int)(endBrace - start) <=
				1307	pucnHash->cchMax &&
				1308	endBrace < end)
				1309	{
				1310	endBrace++;
				1311	}
				1312	if (endBrace != end && *endBrace == '}')
				1313	{
				1314	j = pucnHash->hash(start, endBrace - start);
				1315	if (j > pucnHash->cKeys \|\|
				1316	mystrnicmp(
				1317	start,
				1318	((_Py_UnicodeCharacterName *)
				1319	(pucnHash->getValue(j)))->pszUCN,
				1320	(int)(endBrace - start)) != 0)
				1321	{
				1322	if (unicodeescape_decoding_error(
				1323	&s, &x, errors,
				1324	"Invalid Unicode Character Name"))
				1325	{
				1326	goto onError;
				1327	}
				1328	goto ucnFallthrough;
				1329	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1330	value = ((_Py_UnicodeCharacterName *)
				1331	(pucnHash->getValue(j)))->value;
				1332	if (value < 1<<16)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1333	{
				1334	/* In UCS-2 range, easy solution.. */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1335	*p++ = value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1336	}
				1337	else
				1338	{
				1339	/* Oops, its in UCS-4 space, */
				1340	/* compute and append the two surrogates: */
				1341	/* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1342	value -= 0x10000;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1343
				1344	/* high surrogate = top 10 bits added to D800 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1345	*p++ = 0xD800 + (value >> 10);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1346
				1347	/* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1348	*p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1349	}
				1350	s = endBrace + 1;
				1351	}
				1352	else
				1353	{
				1354	if (unicodeescape_decoding_error(
				1355	&s, &x, errors,
				1356	"Unicode name missing closing brace"))
				1357	goto onError;
				1358	goto ucnFallthrough;
				1359	}
				1360	break;
				1361	}
				1362	if (unicodeescape_decoding_error(
				1363	&s, &x, errors,
				1364	"Missing opening brace for Unicode Character Name escape"))
				1365	goto onError;
				1366	ucnFallthrough:
				1367	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1368	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1369	*p++ = '\\';
				1370	*p++ = (unsigned char)s[-1];
				1371	break;
				1372	}
				1373	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1374	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1375	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1376	return (PyObject *)v;
				1377
				1378	onError:
				1379	Py_XDECREF(v);
				1380	return NULL;
				1381	}
				1382
				1383	/* Return a Unicode-Escape string version of the Unicode object.
				1384
				1385	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1386	appropriate.
				1387
				1388	*/
				1389
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1390	static const Py_UNICODE findchar(const Py_UNICODE s,
				1391	int size,
				1392	Py_UNICODE ch);
				1393
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1394	static
				1395	PyObject unicodeescape_string(const Py_UNICODE s,
				1396	int size,
				1397	int quotes)
				1398	{
				1399	PyObject *repr;
				1400	char *p;
				1401	char *q;
				1402
				1403	static const char *hexdigit = "0123456789ABCDEF";
				1404
				1405	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1406	if (repr == NULL)
				1407	return NULL;
				1408
				1409	p = q = PyString_AS_STRING(repr);
				1410
				1411	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1412	*p++ = 'u';
				1413	*p++ = (findchar(s, size, '\'') &&
				1414	!findchar(s, size, '"')) ? '"' : '\'';
				1415	}
				1416	while (size-- > 0) {
				1417	Py_UNICODE ch = *s++;
				1418	/* Escape quotes */
				1419	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1420	*p++ = '\\';
				1421	*p++ = (char) ch;
				1422	}
				1423	/* Map 16-bit characters to '\uxxxx' */
				1424	else if (ch >= 256) {
				1425	*p++ = '\\';
				1426	*p++ = 'u';
				1427	*p++ = hexdigit[(ch >> 12) & 0xf];
				1428	*p++ = hexdigit[(ch >> 8) & 0xf];
				1429	*p++ = hexdigit[(ch >> 4) & 0xf];
				1430	*p++ = hexdigit[ch & 15];
				1431	}
				1432	/* Map non-printable US ASCII to '\ooo' */
				1433	else if (ch < ' ' \|\| ch >= 128) {
				1434	*p++ = '\\';
				1435	*p++ = hexdigit[(ch >> 6) & 7];
				1436	*p++ = hexdigit[(ch >> 3) & 7];
				1437	*p++ = hexdigit[ch & 7];
				1438	}
				1439	/* Copy everything else as-is */
				1440	else
				1441	*p++ = (char) ch;
				1442	}
				1443	if (quotes)
				1444	*p++ = q[1];
				1445
				1446	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1447	if (_PyString_Resize(&repr, p - q))
				1448	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1449
				1450	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1451
				1452	onError:
				1453	Py_DECREF(repr);
				1454	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1455	}
				1456
				1457	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1458	int size)
				1459	{
				1460	return unicodeescape_string(s, size, 0);
				1461	}
				1462
				1463	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1464	{
				1465	if (!PyUnicode_Check(unicode)) {
				1466	PyErr_BadArgument();
				1467	return NULL;
				1468	}
				1469	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1470	PyUnicode_GET_SIZE(unicode));
				1471	}
				1472
				1473	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1474
				1475	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1476	int size,
				1477	const char *errors)
				1478	{
				1479	PyUnicodeObject *v;
				1480	Py_UNICODE p, buf;
				1481	const char *end;
				1482	const char *bs;
				1483
				1484	/* Escaped strings will always be longer than the resulting
				1485	Unicode string, so we start with size here and then reduce the
				1486	length after conversion to the true value. */
				1487	v = _PyUnicode_New(size);
				1488	if (v == NULL)
				1489	goto onError;
				1490	if (size == 0)
				1491	return (PyObject *)v;
				1492	p = buf = PyUnicode_AS_UNICODE(v);
				1493	end = s + size;
				1494	while (s < end) {
				1495	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1496	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1497	int i;
				1498
				1499	/* Non-escape characters are interpreted as Unicode ordinals */
				1500	if (*s != '\\') {
				1501	p++ = (unsigned char)s++;
				1502	continue;
				1503	}
				1504
				1505	/* \u-escapes are only interpreted iff the number of leading
				1506	backslashes if odd */
				1507	bs = s;
				1508	for (;s < end;) {
				1509	if (*s != '\\')
				1510	break;
				1511	p++ = (unsigned char)s++;
				1512	}
				1513	if (((s - bs) & 1) == 0 \|\|
				1514	s >= end \|\|
				1515	*s != 'u') {
				1516	continue;
				1517	}
				1518	p--;
				1519	s++;
				1520
				1521	/* \uXXXX with 4 hex digits */
				1522	for (x = 0, i = 0; i < 4; i++) {
				1523	c = (unsigned char)s[i];
				1524	if (!isxdigit(c)) {
				1525	if (unicodeescape_decoding_error(&s, &x, errors,
				1526	"truncated \\uXXXX"))
				1527	goto onError;
				1528	i++;
				1529	break;
				1530	}
				1531	x = (x<<4) & ~0xF;
				1532	if (c >= '0' && c <= '9')
				1533	x += c - '0';
				1534	else if (c >= 'a' && c <= 'f')
				1535	x += 10 + c - 'a';
				1536	else
				1537	x += 10 + c - 'A';
				1538	}
				1539	s += i;
				1540	*p++ = x;
				1541	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1542	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1543	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1544	return (PyObject *)v;
				1545
				1546	onError:
				1547	Py_XDECREF(v);
				1548	return NULL;
				1549	}
				1550
				1551	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1552	int size)
				1553	{
				1554	PyObject *repr;
				1555	char *p;
				1556	char *q;
				1557
				1558	static const char *hexdigit = "0123456789ABCDEF";
				1559
				1560	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1561	if (repr == NULL)
				1562	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame^]	1563	if (size == 0)
				1564	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1565
				1566	p = q = PyString_AS_STRING(repr);
				1567	while (size-- > 0) {
				1568	Py_UNICODE ch = *s++;
				1569	/* Map 16-bit characters to '\uxxxx' */
				1570	if (ch >= 256) {
				1571	*p++ = '\\';
				1572	*p++ = 'u';
				1573	*p++ = hexdigit[(ch >> 12) & 0xf];
				1574	*p++ = hexdigit[(ch >> 8) & 0xf];
				1575	*p++ = hexdigit[(ch >> 4) & 0xf];
				1576	*p++ = hexdigit[ch & 15];
				1577	}
				1578	/* Copy everything else as-is */
				1579	else
				1580	*p++ = (char) ch;
				1581	}
				1582	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1583	if (_PyString_Resize(&repr, p - q))
				1584	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1585
				1586	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1587
				1588	onError:
				1589	Py_DECREF(repr);
				1590	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1591	}
				1592
				1593	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1594	{
				1595	if (!PyUnicode_Check(unicode)) {
				1596	PyErr_BadArgument();
				1597	return NULL;
				1598	}
				1599	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1600	PyUnicode_GET_SIZE(unicode));
				1601	}
				1602
				1603	/* --- Latin-1 Codec ------------------------------------------------------ */
				1604
				1605	PyObject PyUnicode_DecodeLatin1(const char s,
				1606	int size,
				1607	const char *errors)
				1608	{
				1609	PyUnicodeObject *v;
				1610	Py_UNICODE *p;
				1611
				1612	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1613	v = _PyUnicode_New(size);
				1614	if (v == NULL)
				1615	goto onError;
				1616	if (size == 0)
				1617	return (PyObject *)v;
				1618	p = PyUnicode_AS_UNICODE(v);
				1619	while (size-- > 0)
				1620	p++ = (unsigned char)s++;
				1621	return (PyObject *)v;
				1622
				1623	onError:
				1624	Py_XDECREF(v);
				1625	return NULL;
				1626	}
				1627
				1628	static
				1629	int latin1_encoding_error(const Py_UNICODE **source,
				1630	char **dest,
				1631	const char *errors,
				1632	const char *details)
				1633	{
				1634	if ((errors == NULL) \|\|
				1635	(strcmp(errors,"strict") == 0)) {
				1636	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1637	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1638	details);
				1639	return -1;
				1640	}
				1641	else if (strcmp(errors,"ignore") == 0) {
				1642	return 0;
				1643	}
				1644	else if (strcmp(errors,"replace") == 0) {
				1645	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1646	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1647	return 0;
				1648	}
				1649	else {
				1650	PyErr_Format(PyExc_ValueError,
				1651	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1652	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1653	errors);
				1654	return -1;
				1655	}
				1656	}
				1657
				1658	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1659	int size,
				1660	const char *errors)
				1661	{
				1662	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1663	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame^]	1664
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1665	repr = PyString_FromStringAndSize(NULL, size);
				1666	if (repr == NULL)
				1667	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame^]	1668	if (size == 0)
				1669	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1670
				1671	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1672	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1673	while (size-- > 0) {
				1674	Py_UNICODE ch = *p++;
				1675	if (ch >= 256) {
				1676	if (latin1_encoding_error(&p, &s, errors,
				1677	"ordinal not in range(256)"))
				1678	goto onError;
				1679	}
				1680	else
				1681	*s++ = (char)ch;
				1682	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1683	/* Resize if error handling skipped some characters */
				1684	if (s - start < PyString_GET_SIZE(repr))
				1685	if (_PyString_Resize(&repr, s - start))
				1686	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1687	return repr;
				1688
				1689	onError:
				1690	Py_DECREF(repr);
				1691	return NULL;
				1692	}
				1693
				1694	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1695	{
				1696	if (!PyUnicode_Check(unicode)) {
				1697	PyErr_BadArgument();
				1698	return NULL;
				1699	}
				1700	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1701	PyUnicode_GET_SIZE(unicode),
				1702	NULL);
				1703	}
				1704
				1705	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1706
				1707	static
				1708	int ascii_decoding_error(const char **source,
				1709	Py_UNICODE **dest,
				1710	const char *errors,
				1711	const char *details)
				1712	{
				1713	if ((errors == NULL) \|\|
				1714	(strcmp(errors,"strict") == 0)) {
				1715	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1716	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1717	details);
				1718	return -1;
				1719	}
				1720	else if (strcmp(errors,"ignore") == 0) {
				1721	return 0;
				1722	}
				1723	else if (strcmp(errors,"replace") == 0) {
				1724	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1725	(*dest)++;
				1726	return 0;
				1727	}
				1728	else {
				1729	PyErr_Format(PyExc_ValueError,
				1730	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1731	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1732	errors);
				1733	return -1;
				1734	}
				1735	}
				1736
				1737	PyObject PyUnicode_DecodeASCII(const char s,
				1738	int size,
				1739	const char *errors)
				1740	{
				1741	PyUnicodeObject *v;
				1742	Py_UNICODE *p;
				1743
				1744	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1745	v = _PyUnicode_New(size);
				1746	if (v == NULL)
				1747	goto onError;
				1748	if (size == 0)
				1749	return (PyObject *)v;
				1750	p = PyUnicode_AS_UNICODE(v);
				1751	while (size-- > 0) {
				1752	register unsigned char c;
				1753
				1754	c = (unsigned char)*s++;
				1755	if (c < 128)
				1756	*p++ = c;
				1757	else if (ascii_decoding_error(&s, &p, errors,
				1758	"ordinal not in range(128)"))
				1759	goto onError;
				1760	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1761	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1762	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1763	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1764	return (PyObject *)v;
				1765
				1766	onError:
				1767	Py_XDECREF(v);
				1768	return NULL;
				1769	}
				1770
				1771	static
				1772	int ascii_encoding_error(const Py_UNICODE **source,
				1773	char **dest,
				1774	const char *errors,
				1775	const char *details)
				1776	{
				1777	if ((errors == NULL) \|\|
				1778	(strcmp(errors,"strict") == 0)) {
				1779	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1780	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1781	details);
				1782	return -1;
				1783	}
				1784	else if (strcmp(errors,"ignore") == 0) {
				1785	return 0;
				1786	}
				1787	else if (strcmp(errors,"replace") == 0) {
				1788	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1789	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1790	return 0;
				1791	}
				1792	else {
				1793	PyErr_Format(PyExc_ValueError,
				1794	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1795	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1796	errors);
				1797	return -1;
				1798	}
				1799	}
				1800
				1801	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1802	int size,
				1803	const char *errors)
				1804	{
				1805	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1806	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame^]	1807
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1808	repr = PyString_FromStringAndSize(NULL, size);
				1809	if (repr == NULL)
				1810	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame^]	1811	if (size == 0)
				1812	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1813
				1814	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1815	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1816	while (size-- > 0) {
				1817	Py_UNICODE ch = *p++;
				1818	if (ch >= 128) {
				1819	if (ascii_encoding_error(&p, &s, errors,
				1820	"ordinal not in range(128)"))
				1821	goto onError;
				1822	}
				1823	else
				1824	*s++ = (char)ch;
				1825	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1826	/* Resize if error handling skipped some characters */
				1827	if (s - start < PyString_GET_SIZE(repr))
				1828	if (_PyString_Resize(&repr, s - start))
				1829	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1830	return repr;
				1831
				1832	onError:
				1833	Py_DECREF(repr);
				1834	return NULL;
				1835	}
				1836
				1837	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1838	{
				1839	if (!PyUnicode_Check(unicode)) {
				1840	PyErr_BadArgument();
				1841	return NULL;
				1842	}
				1843	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1844	PyUnicode_GET_SIZE(unicode),
				1845	NULL);
				1846	}
				1847
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1848	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1849
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1850	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1851
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1852	PyObject PyUnicode_DecodeMBCS(const char s,
				1853	int size,
				1854	const char *errors)
				1855	{
				1856	PyUnicodeObject *v;
				1857	Py_UNICODE *p;
				1858
				1859	/* First get the size of the result */
				1860	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1861	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1862	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1863
				1864	v = _PyUnicode_New(usize);
				1865	if (v == NULL)
				1866	return NULL;
				1867	if (usize == 0)
				1868	return (PyObject *)v;
				1869	p = PyUnicode_AS_UNICODE(v);
				1870	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1871	Py_DECREF(v);
				1872	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1873	}
				1874
				1875	return (PyObject *)v;
				1876	}
				1877
				1878	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1879	int size,
				1880	const char *errors)
				1881	{
				1882	PyObject *repr;
				1883	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1884	DWORD mbcssize;
				1885
				1886	/* If there are no characters, bail now! */
				1887	if (size==0)
				1888	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1889
				1890	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1891	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1892	if (mbcssize==0)
				1893	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1894
				1895	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1896	if (repr == NULL)
				1897	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame^]	1898	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1899	return repr;
				1900
				1901	/* Do the conversion */
				1902	s = PyString_AS_STRING(repr);
				1903	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1904	Py_DECREF(repr);
				1905	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1906	}
				1907	return repr;
				1908	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1909
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1910	#endif /* MS_WIN32 */
				1911
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1912	/* --- Character Mapping Codec -------------------------------------------- */
				1913
				1914	static
				1915	int charmap_decoding_error(const char **source,
				1916	Py_UNICODE **dest,
				1917	const char *errors,
				1918	const char *details)
				1919	{
				1920	if ((errors == NULL) \|\|
				1921	(strcmp(errors,"strict") == 0)) {
				1922	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1923	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1924	details);
				1925	return -1;
				1926	}
				1927	else if (strcmp(errors,"ignore") == 0) {
				1928	return 0;
				1929	}
				1930	else if (strcmp(errors,"replace") == 0) {
				1931	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1932	(*dest)++;
				1933	return 0;
				1934	}
				1935	else {
				1936	PyErr_Format(PyExc_ValueError,
				1937	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1938	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1939	errors);
				1940	return -1;
				1941	}
				1942	}
				1943
				1944	PyObject PyUnicode_DecodeCharmap(const char s,
				1945	int size,
				1946	PyObject *mapping,
				1947	const char *errors)
				1948	{
				1949	PyUnicodeObject *v;
				1950	Py_UNICODE *p;
				1951
				1952	/* Default to Latin-1 */
				1953	if (mapping == NULL)
				1954	return PyUnicode_DecodeLatin1(s, size, errors);
				1955
				1956	v = _PyUnicode_New(size);
				1957	if (v == NULL)
				1958	goto onError;
				1959	if (size == 0)
				1960	return (PyObject *)v;
				1961	p = PyUnicode_AS_UNICODE(v);
				1962	while (size-- > 0) {
				1963	unsigned char ch = *s++;
				1964	PyObject w, x;
				1965
				1966	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1967	w = PyInt_FromLong((long)ch);
				1968	if (w == NULL)
				1969	goto onError;
				1970	x = PyObject_GetItem(mapping, w);
				1971	Py_DECREF(w);
				1972	if (x == NULL) {
				1973	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1974	/* No mapping found: default to Latin-1 mapping */
				1975	PyErr_Clear();
				1976	*p++ = (Py_UNICODE)ch;
				1977	continue;
				1978	}
				1979	goto onError;
				1980	}
				1981
				1982	/* Apply mapping */
				1983	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1984	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1985	if (value < 0 \|\| value > 65535) {
				1986	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1987	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1988	Py_DECREF(x);
				1989	goto onError;
				1990	}
				1991	*p++ = (Py_UNICODE)value;
				1992	}
				1993	else if (x == Py_None) {
				1994	/* undefined mapping */
				1995	if (charmap_decoding_error(&s, &p, errors,
				1996	"character maps to <undefined>")) {
				1997	Py_DECREF(x);
				1998	goto onError;
				1999	}
				2000	}
				2001	else if (PyUnicode_Check(x)) {
				2002	if (PyUnicode_GET_SIZE(x) != 1) {
				2003	/* 1-n mapping */
				2004	PyErr_SetString(PyExc_NotImplementedError,
				2005	"1-n mappings are currently not implemented");
				2006	Py_DECREF(x);
				2007	goto onError;
				2008	}
				2009	p++ = PyUnicode_AS_UNICODE(x);
				2010	}
				2011	else {
				2012	/* wrong return value */
				2013	PyErr_SetString(PyExc_TypeError,
				2014	"character mapping must return integer, None or unicode");
				2015	Py_DECREF(x);
				2016	goto onError;
				2017	}
				2018	Py_DECREF(x);
				2019	}
				2020	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2021	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2022	goto onError;
				2023	return (PyObject *)v;
				2024
				2025	onError:
				2026	Py_XDECREF(v);
				2027	return NULL;
				2028	}
				2029
				2030	static
				2031	int charmap_encoding_error(const Py_UNICODE **source,
				2032	char **dest,
				2033	const char *errors,
				2034	const char *details)
				2035	{
				2036	if ((errors == NULL) \|\|
				2037	(strcmp(errors,"strict") == 0)) {
				2038	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2039	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2040	details);
				2041	return -1;
				2042	}
				2043	else if (strcmp(errors,"ignore") == 0) {
				2044	return 0;
				2045	}
				2046	else if (strcmp(errors,"replace") == 0) {
				2047	**dest = '?';
				2048	(*dest)++;
				2049	return 0;
				2050	}
				2051	else {
				2052	PyErr_Format(PyExc_ValueError,
				2053	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2054	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2055	errors);
				2056	return -1;
				2057	}
				2058	}
				2059
				2060	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2061	int size,
				2062	PyObject *mapping,
				2063	const char *errors)
				2064	{
				2065	PyObject *v;
				2066	char *s;
				2067
				2068	/* Default to Latin-1 */
				2069	if (mapping == NULL)
				2070	return PyUnicode_EncodeLatin1(p, size, errors);
				2071
				2072	v = PyString_FromStringAndSize(NULL, size);
				2073	if (v == NULL)
				2074	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame^]	2075	if (size == 0)
				2076	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2077	s = PyString_AS_STRING(v);
				2078	while (size-- > 0) {
				2079	Py_UNICODE ch = *p++;
				2080	PyObject w, x;
				2081
				2082	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2083	w = PyInt_FromLong((long)ch);
				2084	if (w == NULL)
				2085	goto onError;
				2086	x = PyObject_GetItem(mapping, w);
				2087	Py_DECREF(w);
				2088	if (x == NULL) {
				2089	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2090	/* No mapping found: default to Latin-1 mapping if possible */
				2091	PyErr_Clear();
				2092	if (ch < 256) {
				2093	*s++ = (char)ch;
				2094	continue;
				2095	}
				2096	else if (!charmap_encoding_error(&p, &s, errors,
				2097	"missing character mapping"))
				2098	continue;
				2099	}
				2100	goto onError;
				2101	}
				2102
				2103	/* Apply mapping */
				2104	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2105	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2106	if (value < 0 \|\| value > 255) {
				2107	PyErr_SetString(PyExc_TypeError,
				2108	"character mapping must be in range(256)");
				2109	Py_DECREF(x);
				2110	goto onError;
				2111	}
				2112	*s++ = (char)value;
				2113	}
				2114	else if (x == Py_None) {
				2115	/* undefined mapping */
				2116	if (charmap_encoding_error(&p, &s, errors,
				2117	"character maps to <undefined>")) {
				2118	Py_DECREF(x);
				2119	goto onError;
				2120	}
				2121	}
				2122	else if (PyString_Check(x)) {
				2123	if (PyString_GET_SIZE(x) != 1) {
				2124	/* 1-n mapping */
				2125	PyErr_SetString(PyExc_NotImplementedError,
				2126	"1-n mappings are currently not implemented");
				2127	Py_DECREF(x);
				2128	goto onError;
				2129	}
				2130	s++ = PyString_AS_STRING(x);
				2131	}
				2132	else {
				2133	/* wrong return value */
				2134	PyErr_SetString(PyExc_TypeError,
				2135	"character mapping must return integer, None or unicode");
				2136	Py_DECREF(x);
				2137	goto onError;
				2138	}
				2139	Py_DECREF(x);
				2140	}
				2141	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2142	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2143	goto onError;
				2144	return v;
				2145
				2146	onError:
				2147	Py_DECREF(v);
				2148	return NULL;
				2149	}
				2150
				2151	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2152	PyObject *mapping)
				2153	{
				2154	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2155	PyErr_BadArgument();
				2156	return NULL;
				2157	}
				2158	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2159	PyUnicode_GET_SIZE(unicode),
				2160	mapping,
				2161	NULL);
				2162	}
				2163
				2164	static
				2165	int translate_error(const Py_UNICODE **source,
				2166	Py_UNICODE **dest,
				2167	const char *errors,
				2168	const char *details)
				2169	{
				2170	if ((errors == NULL) \|\|
				2171	(strcmp(errors,"strict") == 0)) {
				2172	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2173	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2174	details);
				2175	return -1;
				2176	}
				2177	else if (strcmp(errors,"ignore") == 0) {
				2178	return 0;
				2179	}
				2180	else if (strcmp(errors,"replace") == 0) {
				2181	**dest = '?';
				2182	(*dest)++;
				2183	return 0;
				2184	}
				2185	else {
				2186	PyErr_Format(PyExc_ValueError,
				2187	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2188	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2189	errors);
				2190	return -1;
				2191	}
				2192	}
				2193
				2194	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2195	int size,
				2196	PyObject *mapping,
				2197	const char *errors)
				2198	{
				2199	PyUnicodeObject *v;
				2200	Py_UNICODE *p;
				2201
				2202	if (mapping == NULL) {
				2203	PyErr_BadArgument();
				2204	return NULL;
				2205	}
				2206
				2207	/* Output will never be longer than input */
				2208	v = _PyUnicode_New(size);
				2209	if (v == NULL)
				2210	goto onError;
				2211	if (size == 0)
				2212	goto done;
				2213	p = PyUnicode_AS_UNICODE(v);
				2214	while (size-- > 0) {
				2215	Py_UNICODE ch = *s++;
				2216	PyObject w, x;
				2217
				2218	/* Get mapping */
				2219	w = PyInt_FromLong(ch);
				2220	if (w == NULL)
				2221	goto onError;
				2222	x = PyObject_GetItem(mapping, w);
				2223	Py_DECREF(w);
				2224	if (x == NULL) {
				2225	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2226	/* No mapping found: default to 1-1 mapping */
				2227	PyErr_Clear();
				2228	*p++ = ch;
				2229	continue;
				2230	}
				2231	goto onError;
				2232	}
				2233
				2234	/* Apply mapping */
				2235	if (PyInt_Check(x))
				2236	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2237	else if (x == Py_None) {
				2238	/* undefined mapping */
				2239	if (translate_error(&s, &p, errors,
				2240	"character maps to <undefined>")) {
				2241	Py_DECREF(x);
				2242	goto onError;
				2243	}
				2244	}
				2245	else if (PyUnicode_Check(x)) {
				2246	if (PyUnicode_GET_SIZE(x) != 1) {
				2247	/* 1-n mapping */
				2248	PyErr_SetString(PyExc_NotImplementedError,
				2249	"1-n mappings are currently not implemented");
				2250	Py_DECREF(x);
				2251	goto onError;
				2252	}
				2253	p++ = PyUnicode_AS_UNICODE(x);
				2254	}
				2255	else {
				2256	/* wrong return value */
				2257	PyErr_SetString(PyExc_TypeError,
				2258	"translate mapping must return integer, None or unicode");
				2259	Py_DECREF(x);
				2260	goto onError;
				2261	}
				2262	Py_DECREF(x);
				2263	}
				2264	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2265	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2266	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2267
				2268	done:
				2269	return (PyObject *)v;
				2270
				2271	onError:
				2272	Py_XDECREF(v);
				2273	return NULL;
				2274	}
				2275
				2276	PyObject PyUnicode_Translate(PyObject str,
				2277	PyObject *mapping,
				2278	const char *errors)
				2279	{
				2280	PyObject *result;
				2281
				2282	str = PyUnicode_FromObject(str);
				2283	if (str == NULL)
				2284	goto onError;
				2285	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2286	PyUnicode_GET_SIZE(str),
				2287	mapping,
				2288	errors);
				2289	Py_DECREF(str);
				2290	return result;
				2291
				2292	onError:
				2293	Py_XDECREF(str);
				2294	return NULL;
				2295	}
				2296
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2297	/* --- Decimal Encoder ---------------------------------------------------- */
				2298
				2299	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2300	int length,
				2301	char *output,
				2302	const char *errors)
				2303	{
				2304	Py_UNICODE p, end;
				2305
				2306	if (output == NULL) {
				2307	PyErr_BadArgument();
				2308	return -1;
				2309	}
				2310
				2311	p = s;
				2312	end = s + length;
				2313	while (p < end) {
				2314	register Py_UNICODE ch = *p++;
				2315	int decimal;
				2316
				2317	if (Py_UNICODE_ISSPACE(ch)) {
				2318	*output++ = ' ';
				2319	continue;
				2320	}
				2321	decimal = Py_UNICODE_TODECIMAL(ch);
				2322	if (decimal >= 0) {
				2323	*output++ = '0' + decimal;
				2324	continue;
				2325	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2326	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2327	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2328	continue;
				2329	}
				2330	/* All other characters are considered invalid */
				2331	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2332	PyErr_SetString(PyExc_ValueError,
				2333	"invalid decimal Unicode string");
				2334	goto onError;
				2335	}
				2336	else if (strcmp(errors, "ignore") == 0)
				2337	continue;
				2338	else if (strcmp(errors, "replace") == 0) {
				2339	*output++ = '?';
				2340	continue;
				2341	}
				2342	}
				2343	/* 0-terminate the output string */
				2344	*output++ = '\0';
				2345	return 0;
				2346
				2347	onError:
				2348	return -1;
				2349	}
				2350
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2351	/* --- Helpers ------------------------------------------------------------ */
				2352
				2353	static
				2354	int count(PyUnicodeObject *self,
				2355	int start,
				2356	int end,
				2357	PyUnicodeObject *substring)
				2358	{
				2359	int count = 0;
				2360
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2361	if (substring->length == 0)
				2362	return (end - start + 1);
				2363
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2364	end -= substring->length;
				2365
				2366	while (start <= end)
				2367	if (Py_UNICODE_MATCH(self, start, substring)) {
				2368	count++;
				2369	start += substring->length;
				2370	} else
				2371	start++;
				2372
				2373	return count;
				2374	}
				2375
				2376	int PyUnicode_Count(PyObject *str,
				2377	PyObject *substr,
				2378	int start,
				2379	int end)
				2380	{
				2381	int result;
				2382
				2383	str = PyUnicode_FromObject(str);
				2384	if (str == NULL)
				2385	return -1;
				2386	substr = PyUnicode_FromObject(substr);
				2387	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2388	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2389	return -1;
				2390	}
				2391
				2392	result = count((PyUnicodeObject *)str,
				2393	start, end,
				2394	(PyUnicodeObject *)substr);
				2395
				2396	Py_DECREF(str);
				2397	Py_DECREF(substr);
				2398	return result;
				2399	}
				2400
				2401	static
				2402	int findstring(PyUnicodeObject *self,
				2403	PyUnicodeObject *substring,
				2404	int start,
				2405	int end,
				2406	int direction)
				2407	{
				2408	if (start < 0)
				2409	start += self->length;
				2410	if (start < 0)
				2411	start = 0;
				2412
				2413	if (substring->length == 0)
				2414	return start;
				2415
				2416	if (end > self->length)
				2417	end = self->length;
				2418	if (end < 0)
				2419	end += self->length;
				2420	if (end < 0)
				2421	end = 0;
				2422
				2423	end -= substring->length;
				2424
				2425	if (direction < 0) {
				2426	for (; end >= start; end--)
				2427	if (Py_UNICODE_MATCH(self, end, substring))
				2428	return end;
				2429	} else {
				2430	for (; start <= end; start++)
				2431	if (Py_UNICODE_MATCH(self, start, substring))
				2432	return start;
				2433	}
				2434
				2435	return -1;
				2436	}
				2437
				2438	int PyUnicode_Find(PyObject *str,
				2439	PyObject *substr,
				2440	int start,
				2441	int end,
				2442	int direction)
				2443	{
				2444	int result;
				2445
				2446	str = PyUnicode_FromObject(str);
				2447	if (str == NULL)
				2448	return -1;
				2449	substr = PyUnicode_FromObject(substr);
				2450	if (substr == NULL) {
				2451	Py_DECREF(substr);
				2452	return -1;
				2453	}
				2454
				2455	result = findstring((PyUnicodeObject *)str,
				2456	(PyUnicodeObject *)substr,
				2457	start, end, direction);
				2458	Py_DECREF(str);
				2459	Py_DECREF(substr);
				2460	return result;
				2461	}
				2462
				2463	static
				2464	int tailmatch(PyUnicodeObject *self,
				2465	PyUnicodeObject *substring,
				2466	int start,
				2467	int end,
				2468	int direction)
				2469	{
				2470	if (start < 0)
				2471	start += self->length;
				2472	if (start < 0)
				2473	start = 0;
				2474
				2475	if (substring->length == 0)
				2476	return 1;
				2477
				2478	if (end > self->length)
				2479	end = self->length;
				2480	if (end < 0)
				2481	end += self->length;
				2482	if (end < 0)
				2483	end = 0;
				2484
				2485	end -= substring->length;
				2486	if (end < start)
				2487	return 0;
				2488
				2489	if (direction > 0) {
				2490	if (Py_UNICODE_MATCH(self, end, substring))
				2491	return 1;
				2492	} else {
				2493	if (Py_UNICODE_MATCH(self, start, substring))
				2494	return 1;
				2495	}
				2496
				2497	return 0;
				2498	}
				2499
				2500	int PyUnicode_Tailmatch(PyObject *str,
				2501	PyObject *substr,
				2502	int start,
				2503	int end,
				2504	int direction)
				2505	{
				2506	int result;
				2507
				2508	str = PyUnicode_FromObject(str);
				2509	if (str == NULL)
				2510	return -1;
				2511	substr = PyUnicode_FromObject(substr);
				2512	if (substr == NULL) {
				2513	Py_DECREF(substr);
				2514	return -1;
				2515	}
				2516
				2517	result = tailmatch((PyUnicodeObject *)str,
				2518	(PyUnicodeObject *)substr,
				2519	start, end, direction);
				2520	Py_DECREF(str);
				2521	Py_DECREF(substr);
				2522	return result;
				2523	}
				2524
				2525	static
				2526	const Py_UNICODE findchar(const Py_UNICODE s,
				2527	int size,
				2528	Py_UNICODE ch)
				2529	{
				2530	/* like wcschr, but doesn't stop at NULL characters */
				2531
				2532	while (size-- > 0) {
				2533	if (*s == ch)
				2534	return s;
				2535	s++;
				2536	}
				2537
				2538	return NULL;
				2539	}
				2540
				2541	/* Apply fixfct filter to the Unicode object self and return a
				2542	reference to the modified object */
				2543
				2544	static
				2545	PyObject fixup(PyUnicodeObject self,
				2546	int (fixfct)(PyUnicodeObject s))
				2547	{
				2548
				2549	PyUnicodeObject *u;
				2550
				2551	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2552	self->length);
				2553	if (u == NULL)
				2554	return NULL;
				2555	if (!fixfct(u)) {
				2556	/* fixfct should return TRUE if it modified the buffer. If
				2557	FALSE, return a reference to the original buffer instead
				2558	(to save space, not time) */
				2559	Py_INCREF(self);
				2560	Py_DECREF(u);
				2561	return (PyObject*) self;
				2562	}
				2563	return (PyObject*) u;
				2564	}
				2565
				2566	static
				2567	int fixupper(PyUnicodeObject *self)
				2568	{
				2569	int len = self->length;
				2570	Py_UNICODE *s = self->str;
				2571	int status = 0;
				2572
				2573	while (len-- > 0) {
				2574	register Py_UNICODE ch;
				2575
				2576	ch = Py_UNICODE_TOUPPER(*s);
				2577	if (ch != *s) {
				2578	status = 1;
				2579	*s = ch;
				2580	}
				2581	s++;
				2582	}
				2583
				2584	return status;
				2585	}
				2586
				2587	static
				2588	int fixlower(PyUnicodeObject *self)
				2589	{
				2590	int len = self->length;
				2591	Py_UNICODE *s = self->str;
				2592	int status = 0;
				2593
				2594	while (len-- > 0) {
				2595	register Py_UNICODE ch;
				2596
				2597	ch = Py_UNICODE_TOLOWER(*s);
				2598	if (ch != *s) {
				2599	status = 1;
				2600	*s = ch;
				2601	}
				2602	s++;
				2603	}
				2604
				2605	return status;
				2606	}
				2607
				2608	static
				2609	int fixswapcase(PyUnicodeObject *self)
				2610	{
				2611	int len = self->length;
				2612	Py_UNICODE *s = self->str;
				2613	int status = 0;
				2614
				2615	while (len-- > 0) {
				2616	if (Py_UNICODE_ISUPPER(*s)) {
				2617	s = Py_UNICODE_TOLOWER(s);
				2618	status = 1;
				2619	} else if (Py_UNICODE_ISLOWER(*s)) {
				2620	s = Py_UNICODE_TOUPPER(s);
				2621	status = 1;
				2622	}
				2623	s++;
				2624	}
				2625
				2626	return status;
				2627	}
				2628
				2629	static
				2630	int fixcapitalize(PyUnicodeObject *self)
				2631	{
				2632	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2633	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2634	return 1;
				2635	}
				2636	return 0;
				2637	}
				2638
				2639	static
				2640	int fixtitle(PyUnicodeObject *self)
				2641	{
				2642	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2643	register Py_UNICODE *e;
				2644	int previous_is_cased;
				2645
				2646	/* Shortcut for single character strings */
				2647	if (PyUnicode_GET_SIZE(self) == 1) {
				2648	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2649	if (*p != ch) {
				2650	*p = ch;
				2651	return 1;
				2652	}
				2653	else
				2654	return 0;
				2655	}
				2656
				2657	e = p + PyUnicode_GET_SIZE(self);
				2658	previous_is_cased = 0;
				2659	for (; p < e; p++) {
				2660	register const Py_UNICODE ch = *p;
				2661
				2662	if (previous_is_cased)
				2663	*p = Py_UNICODE_TOLOWER(ch);
				2664	else
				2665	*p = Py_UNICODE_TOTITLE(ch);
				2666
				2667	if (Py_UNICODE_ISLOWER(ch) \|\|
				2668	Py_UNICODE_ISUPPER(ch) \|\|
				2669	Py_UNICODE_ISTITLE(ch))
				2670	previous_is_cased = 1;
				2671	else
				2672	previous_is_cased = 0;
				2673	}
				2674	return 1;
				2675	}
				2676
				2677	PyObject PyUnicode_Join(PyObject separator,
				2678	PyObject *seq)
				2679	{
				2680	Py_UNICODE *sep;
				2681	int seplen;
				2682	PyUnicodeObject *res = NULL;
				2683	int reslen = 0;
				2684	Py_UNICODE *p;
				2685	int seqlen = 0;
				2686	int sz = 100;
				2687	int i;
				2688
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2689	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2690	if (seqlen < 0 && PyErr_Occurred())
				2691	return NULL;
				2692
				2693	if (separator == NULL) {
				2694	Py_UNICODE blank = ' ';
				2695	sep = &blank;
				2696	seplen = 1;
				2697	}
				2698	else {
				2699	separator = PyUnicode_FromObject(separator);
				2700	if (separator == NULL)
				2701	return NULL;
				2702	sep = PyUnicode_AS_UNICODE(separator);
				2703	seplen = PyUnicode_GET_SIZE(separator);
				2704	}
				2705
				2706	res = _PyUnicode_New(sz);
				2707	if (res == NULL)
				2708	goto onError;
				2709	p = PyUnicode_AS_UNICODE(res);
				2710	reslen = 0;
				2711
				2712	for (i = 0; i < seqlen; i++) {
				2713	int itemlen;
				2714	PyObject *item;
				2715
				2716	item = PySequence_GetItem(seq, i);
				2717	if (item == NULL)
				2718	goto onError;
				2719	if (!PyUnicode_Check(item)) {
				2720	PyObject *v;
				2721	v = PyUnicode_FromObject(item);
				2722	Py_DECREF(item);
				2723	item = v;
				2724	if (item == NULL)
				2725	goto onError;
				2726	}
				2727	itemlen = PyUnicode_GET_SIZE(item);
				2728	while (reslen + itemlen + seplen >= sz) {
				2729	if (_PyUnicode_Resize(res, sz*2))
				2730	goto onError;
				2731	sz *= 2;
				2732	p = PyUnicode_AS_UNICODE(res) + reslen;
				2733	}
				2734	if (i > 0) {
				2735	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2736	p += seplen;
				2737	reslen += seplen;
				2738	}
				2739	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2740	p += itemlen;
				2741	reslen += itemlen;
				2742	Py_DECREF(item);
				2743	}
				2744	if (_PyUnicode_Resize(res, reslen))
				2745	goto onError;
				2746
				2747	Py_XDECREF(separator);
				2748	return (PyObject *)res;
				2749
				2750	onError:
				2751	Py_XDECREF(separator);
				2752	Py_DECREF(res);
				2753	return NULL;
				2754	}
				2755
				2756	static
				2757	PyUnicodeObject pad(PyUnicodeObject self,
				2758	int left,
				2759	int right,
				2760	Py_UNICODE fill)
				2761	{
				2762	PyUnicodeObject *u;
				2763
				2764	if (left < 0)
				2765	left = 0;
				2766	if (right < 0)
				2767	right = 0;
				2768
				2769	if (left == 0 && right == 0) {
				2770	Py_INCREF(self);
				2771	return self;
				2772	}
				2773
				2774	u = _PyUnicode_New(left + self->length + right);
				2775	if (u) {
				2776	if (left)
				2777	Py_UNICODE_FILL(u->str, fill, left);
				2778	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2779	if (right)
				2780	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2781	}
				2782
				2783	return u;
				2784	}
				2785
				2786	#define SPLIT_APPEND(data, left, right) \
				2787	str = PyUnicode_FromUnicode(data + left, right - left); \
				2788	if (!str) \
				2789	goto onError; \
				2790	if (PyList_Append(list, str)) { \
				2791	Py_DECREF(str); \
				2792	goto onError; \
				2793	} \
				2794	else \
				2795	Py_DECREF(str);
				2796
				2797	static
				2798	PyObject split_whitespace(PyUnicodeObject self,
				2799	PyObject *list,
				2800	int maxcount)
				2801	{
				2802	register int i;
				2803	register int j;
				2804	int len = self->length;
				2805	PyObject *str;
				2806
				2807	for (i = j = 0; i < len; ) {
				2808	/* find a token */
				2809	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2810	i++;
				2811	j = i;
				2812	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2813	i++;
				2814	if (j < i) {
				2815	if (maxcount-- <= 0)
				2816	break;
				2817	SPLIT_APPEND(self->str, j, i);
				2818	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2819	i++;
				2820	j = i;
				2821	}
				2822	}
				2823	if (j < len) {
				2824	SPLIT_APPEND(self->str, j, len);
				2825	}
				2826	return list;
				2827
				2828	onError:
				2829	Py_DECREF(list);
				2830	return NULL;
				2831	}
				2832
				2833	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2834	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2835	{
				2836	register int i;
				2837	register int j;
				2838	int len;
				2839	PyObject *list;
				2840	PyObject *str;
				2841	Py_UNICODE *data;
				2842
				2843	string = PyUnicode_FromObject(string);
				2844	if (string == NULL)
				2845	return NULL;
				2846	data = PyUnicode_AS_UNICODE(string);
				2847	len = PyUnicode_GET_SIZE(string);
				2848
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2849	list = PyList_New(0);
				2850	if (!list)
				2851	goto onError;
				2852
				2853	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2854	int eol;
				2855
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2856	/* Find a line and append it */
				2857	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2858	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2859
				2860	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2861	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2862	if (i < len) {
				2863	if (data[i] == '\r' && i + 1 < len &&
				2864	data[i+1] == '\n')
				2865	i += 2;
				2866	else
				2867	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2868	if (keepends)
				2869	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2870	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2871	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2872	j = i;
				2873	}
				2874	if (j < len) {
				2875	SPLIT_APPEND(data, j, len);
				2876	}
				2877
				2878	Py_DECREF(string);
				2879	return list;
				2880
				2881	onError:
				2882	Py_DECREF(list);
				2883	Py_DECREF(string);
				2884	return NULL;
				2885	}
				2886
				2887	static
				2888	PyObject split_char(PyUnicodeObject self,
				2889	PyObject *list,
				2890	Py_UNICODE ch,
				2891	int maxcount)
				2892	{
				2893	register int i;
				2894	register int j;
				2895	int len = self->length;
				2896	PyObject *str;
				2897
				2898	for (i = j = 0; i < len; ) {
				2899	if (self->str[i] == ch) {
				2900	if (maxcount-- <= 0)
				2901	break;
				2902	SPLIT_APPEND(self->str, j, i);
				2903	i = j = i + 1;
				2904	} else
				2905	i++;
				2906	}
				2907	if (j <= len) {
				2908	SPLIT_APPEND(self->str, j, len);
				2909	}
				2910	return list;
				2911
				2912	onError:
				2913	Py_DECREF(list);
				2914	return NULL;
				2915	}
				2916
				2917	static
				2918	PyObject split_substring(PyUnicodeObject self,
				2919	PyObject *list,
				2920	PyUnicodeObject *substring,
				2921	int maxcount)
				2922	{
				2923	register int i;
				2924	register int j;
				2925	int len = self->length;
				2926	int sublen = substring->length;
				2927	PyObject *str;
				2928
				2929	for (i = j = 0; i < len - sublen; ) {
				2930	if (Py_UNICODE_MATCH(self, i, substring)) {
				2931	if (maxcount-- <= 0)
				2932	break;
				2933	SPLIT_APPEND(self->str, j, i);
				2934	i = j = i + sublen;
				2935	} else
				2936	i++;
				2937	}
				2938	if (j <= len) {
				2939	SPLIT_APPEND(self->str, j, len);
				2940	}
				2941	return list;
				2942
				2943	onError:
				2944	Py_DECREF(list);
				2945	return NULL;
				2946	}
				2947
				2948	#undef SPLIT_APPEND
				2949
				2950	static
				2951	PyObject split(PyUnicodeObject self,
				2952	PyUnicodeObject *substring,
				2953	int maxcount)
				2954	{
				2955	PyObject *list;
				2956
				2957	if (maxcount < 0)
				2958	maxcount = INT_MAX;
				2959
				2960	list = PyList_New(0);
				2961	if (!list)
				2962	return NULL;
				2963
				2964	if (substring == NULL)
				2965	return split_whitespace(self,list,maxcount);
				2966
				2967	else if (substring->length == 1)
				2968	return split_char(self,list,substring->str[0],maxcount);
				2969
				2970	else if (substring->length == 0) {
				2971	Py_DECREF(list);
				2972	PyErr_SetString(PyExc_ValueError, "empty separator");
				2973	return NULL;
				2974	}
				2975	else
				2976	return split_substring(self,list,substring,maxcount);
				2977	}
				2978
				2979	static
				2980	PyObject strip(PyUnicodeObject self,
				2981	int left,
				2982	int right)
				2983	{
				2984	Py_UNICODE *p = self->str;
				2985	int start = 0;
				2986	int end = self->length;
				2987
				2988	if (left)
				2989	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2990	start++;
				2991
				2992	if (right)
				2993	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2994	end--;
				2995
				2996	if (start == 0 && end == self->length) {
				2997	/* couldn't strip anything off, return original string */
				2998	Py_INCREF(self);
				2999	return (PyObject*) self;
				3000	}
				3001
				3002	return (PyObject*) PyUnicode_FromUnicode(
				3003	self->str + start,
				3004	end - start
				3005	);
				3006	}
				3007
				3008	static
				3009	PyObject replace(PyUnicodeObject self,
				3010	PyUnicodeObject *str1,
				3011	PyUnicodeObject *str2,
				3012	int maxcount)
				3013	{
				3014	PyUnicodeObject *u;
				3015
				3016	if (maxcount < 0)
				3017	maxcount = INT_MAX;
				3018
				3019	if (str1->length == 1 && str2->length == 1) {
				3020	int i;
				3021
				3022	/* replace characters */
				3023	if (!findchar(self->str, self->length, str1->str[0])) {
				3024	/* nothing to replace, return original string */
				3025	Py_INCREF(self);
				3026	u = self;
				3027	} else {
				3028	Py_UNICODE u1 = str1->str[0];
				3029	Py_UNICODE u2 = str2->str[0];
				3030
				3031	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3032	self->str,
				3033	self->length
				3034	);
				3035	if (u)
				3036	for (i = 0; i < u->length; i++)
				3037	if (u->str[i] == u1) {
				3038	if (--maxcount < 0)
				3039	break;
				3040	u->str[i] = u2;
				3041	}
				3042	}
				3043
				3044	} else {
				3045	int n, i;
				3046	Py_UNICODE *p;
				3047
				3048	/* replace strings */
				3049	n = count(self, 0, self->length, str1);
				3050	if (n > maxcount)
				3051	n = maxcount;
				3052	if (n == 0) {
				3053	/* nothing to replace, return original string */
				3054	Py_INCREF(self);
				3055	u = self;
				3056	} else {
				3057	u = _PyUnicode_New(
				3058	self->length + n * (str2->length - str1->length));
				3059	if (u) {
				3060	i = 0;
				3061	p = u->str;
				3062	while (i <= self->length - str1->length)
				3063	if (Py_UNICODE_MATCH(self, i, str1)) {
				3064	/* replace string segment */
				3065	Py_UNICODE_COPY(p, str2->str, str2->length);
				3066	p += str2->length;
				3067	i += str1->length;
				3068	if (--n <= 0) {
				3069	/* copy remaining part */
				3070	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3071	break;
				3072	}
				3073	} else
				3074	*p++ = self->str[i++];
				3075	}
				3076	}
				3077	}
				3078
				3079	return (PyObject *) u;
				3080	}
				3081
				3082	/* --- Unicode Object Methods --------------------------------------------- */
				3083
				3084	static char title__doc__[] =
				3085	"S.title() -> unicode\n\
				3086	\n\
				3087	Return a titlecased version of S, i.e. words start with title case\n\
				3088	characters, all remaining cased characters have lower case.";
				3089
				3090	static PyObject*
				3091	unicode_title(PyUnicodeObject self, PyObject args)
				3092	{
				3093	if (!PyArg_NoArgs(args))
				3094	return NULL;
				3095	return fixup(self, fixtitle);
				3096	}
				3097
				3098	static char capitalize__doc__[] =
				3099	"S.capitalize() -> unicode\n\
				3100	\n\
				3101	Return a capitalized version of S, i.e. make the first character\n\
				3102	have upper case.";
				3103
				3104	static PyObject*
				3105	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3106	{
				3107	if (!PyArg_NoArgs(args))
				3108	return NULL;
				3109	return fixup(self, fixcapitalize);
				3110	}
				3111
				3112	#if 0
				3113	static char capwords__doc__[] =
				3114	"S.capwords() -> unicode\n\
				3115	\n\
				3116	Apply .capitalize() to all words in S and return the result with\n\
				3117	normalized whitespace (all whitespace strings are replaced by ' ').";
				3118
				3119	static PyObject*
				3120	unicode_capwords(PyUnicodeObject self, PyObject args)
				3121	{
				3122	PyObject *list;
				3123	PyObject *item;
				3124	int i;
				3125
				3126	if (!PyArg_NoArgs(args))
				3127	return NULL;
				3128
				3129	/* Split into words */
				3130	list = split(self, NULL, -1);
				3131	if (!list)
				3132	return NULL;
				3133
				3134	/* Capitalize each word */
				3135	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3136	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3137	fixcapitalize);
				3138	if (item == NULL)
				3139	goto onError;
				3140	Py_DECREF(PyList_GET_ITEM(list, i));
				3141	PyList_SET_ITEM(list, i, item);
				3142	}
				3143
				3144	/* Join the words to form a new string */
				3145	item = PyUnicode_Join(NULL, list);
				3146
				3147	onError:
				3148	Py_DECREF(list);
				3149	return (PyObject *)item;
				3150	}
				3151	#endif
				3152
				3153	static char center__doc__[] =
				3154	"S.center(width) -> unicode\n\
				3155	\n\
				3156	Return S centered in a Unicode string of length width. Padding is done\n\
				3157	using spaces.";
				3158
				3159	static PyObject *
				3160	unicode_center(PyUnicodeObject self, PyObject args)
				3161	{
				3162	int marg, left;
				3163	int width;
				3164
				3165	if (!PyArg_ParseTuple(args, "i:center", &width))
				3166	return NULL;
				3167
				3168	if (self->length >= width) {
				3169	Py_INCREF(self);
				3170	return (PyObject*) self;
				3171	}
				3172
				3173	marg = width - self->length;
				3174	left = marg / 2 + (marg & width & 1);
				3175
				3176	return (PyObject*) pad(self, left, marg - left, ' ');
				3177	}
				3178
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3179	#if 0
				3180
				3181	/* This code should go into some future Unicode collation support
				3182	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3183	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3184
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3185	/* speedy UTF-16 code point order comparison */
				3186	/* gleaned from: */
				3187	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3188
/* Per-block offsets, indexed by the top five bits (ch >> 11) of a 16-bit
   code unit.  Only the last five entries are non-zero. */
static short utf16Fixup[32] =
{
    /* 0x00 - 0x07 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0f */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x17 */  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1a */  0, 0, 0,
    /* 0x1b        */  0x2000,
    /* 0x1c - 0x1f */  -0x800, -0x800, -0x800, -0x800
};
				3196
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3197	static int
				3198	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3199	{
				3200	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3201
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3202	Py_UNICODE *s1 = str1->str;
				3203	Py_UNICODE *s2 = str2->str;
				3204
				3205	len1 = str1->length;
				3206	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3207
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3208	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3209	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3210	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3211
				3212	c1 = *s1++;
				3213	c2 = *s2++;
				3214	if (c1 > (1<<11) * 26)
				3215	c1 += utf16Fixup[c1>>11];
				3216	if (c2 > (1<<11) * 26)
				3217	c2 += utf16Fixup[c2>>11];
				3218
				3219	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3220	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3221	if (diff)
				3222	return (diff < 0) ? -1 : (diff != 0);
				3223	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3224	}
				3225
				3226	return (len1 < len2) ? -1 : (len1 != len2);
				3227	}
				3228
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3229	#else
				3230
				3231	static int
				3232	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3233	{
				3234	register int len1, len2;
				3235
				3236	Py_UNICODE *s1 = str1->str;
				3237	Py_UNICODE *s2 = str2->str;
				3238
				3239	len1 = str1->length;
				3240	len2 = str2->length;
				3241
				3242	while (len1 > 0 && len2 > 0) {
				3243	register long diff;
				3244
				3245	diff = (long)s1++ - (long)s2++;
				3246	if (diff)
				3247	return (diff < 0) ? -1 : (diff != 0);
				3248	len1--; len2--;
				3249	}
				3250
				3251	return (len1 < len2) ? -1 : (len1 != len2);
				3252	}
				3253
				3254	#endif
				3255
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3256	int PyUnicode_Compare(PyObject *left,
				3257	PyObject *right)
				3258	{
				3259	PyUnicodeObject u = NULL, v = NULL;
				3260	int result;
				3261
				3262	/* Coerce the two arguments */
				3263	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3264	if (u == NULL)
				3265	goto onError;
				3266	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3267	if (v == NULL)
				3268	goto onError;
				3269
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3270	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3271	if (v == u) {
				3272	Py_DECREF(u);
				3273	Py_DECREF(v);
				3274	return 0;
				3275	}
				3276
				3277	result = unicode_compare(u, v);
				3278
				3279	Py_DECREF(u);
				3280	Py_DECREF(v);
				3281	return result;
				3282
				3283	onError:
				3284	Py_XDECREF(u);
				3285	Py_XDECREF(v);
				3286	return -1;
				3287	}
				3288
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3289	int PyUnicode_Contains(PyObject *container,
				3290	PyObject *element)
				3291	{
				3292	PyUnicodeObject u = NULL, v = NULL;
				3293	int result;
				3294	register const Py_UNICODE p, e;
				3295	register Py_UNICODE ch;
				3296
				3297	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3298	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3299	if (v == NULL) {
				3300	PyErr_SetString(PyExc_TypeError,
				3301	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3302	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3303	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3304	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3305	if (u == NULL) {
				3306	Py_DECREF(v);
				3307	goto onError;
				3308	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3309
				3310	/* Check v in u */
				3311	if (PyUnicode_GET_SIZE(v) != 1) {
				3312	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3313	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3314	goto onError;
				3315	}
				3316	ch = *PyUnicode_AS_UNICODE(v);
				3317	p = PyUnicode_AS_UNICODE(u);
				3318	e = p + PyUnicode_GET_SIZE(u);
				3319	result = 0;
				3320	while (p < e) {
				3321	if (*p++ == ch) {
				3322	result = 1;
				3323	break;
				3324	}
				3325	}
				3326
				3327	Py_DECREF(u);
				3328	Py_DECREF(v);
				3329	return result;
				3330
				3331	onError:
				3332	Py_XDECREF(u);
				3333	Py_XDECREF(v);
				3334	return -1;
				3335	}
				3336
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3337	/* Concat to string or Unicode object giving a new Unicode object. */
				3338
				3339	PyObject PyUnicode_Concat(PyObject left,
				3340	PyObject *right)
				3341	{
				3342	PyUnicodeObject u = NULL, v = NULL, *w;
				3343
				3344	/* Coerce the two arguments */
				3345	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3346	if (u == NULL)
				3347	goto onError;
				3348	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3349	if (v == NULL)
				3350	goto onError;
				3351
				3352	/* Shortcuts */
				3353	if (v == unicode_empty) {
				3354	Py_DECREF(v);
				3355	return (PyObject *)u;
				3356	}
				3357	if (u == unicode_empty) {
				3358	Py_DECREF(u);
				3359	return (PyObject *)v;
				3360	}
				3361
				3362	/* Concat the two Unicode strings */
				3363	w = _PyUnicode_New(u->length + v->length);
				3364	if (w == NULL)
				3365	goto onError;
				3366	Py_UNICODE_COPY(w->str, u->str, u->length);
				3367	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3368
				3369	Py_DECREF(u);
				3370	Py_DECREF(v);
				3371	return (PyObject *)w;
				3372
				3373	onError:
				3374	Py_XDECREF(u);
				3375	Py_XDECREF(v);
				3376	return NULL;
				3377	}
				3378
				3379	static char count__doc__[] =
				3380	"S.count(sub[, start[, end]]) -> int\n\
				3381	\n\
				3382	Return the number of occurrences of substring sub in Unicode string\n\
				3383	S[start:end]. Optional arguments start and end are\n\
				3384	interpreted as in slice notation.";
				3385
				3386	static PyObject *
				3387	unicode_count(PyUnicodeObject self, PyObject args)
				3388	{
				3389	PyUnicodeObject *substring;
				3390	int start = 0;
				3391	int end = INT_MAX;
				3392	PyObject *result;
				3393
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3394	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3395	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3396	return NULL;
				3397
				3398	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3399	(PyObject *)substring);
				3400	if (substring == NULL)
				3401	return NULL;
				3402
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3403	if (start < 0)
				3404	start += self->length;
				3405	if (start < 0)
				3406	start = 0;
				3407	if (end > self->length)
				3408	end = self->length;
				3409	if (end < 0)
				3410	end += self->length;
				3411	if (end < 0)
				3412	end = 0;
				3413
				3414	result = PyInt_FromLong((long) count(self, start, end, substring));
				3415
				3416	Py_DECREF(substring);
				3417	return result;
				3418	}
				3419
				3420	static char encode__doc__[] =
				3421	"S.encode([encoding[,errors]]) -> string\n\
				3422	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3423	Return an encoded string version of S. Default encoding is the current\n\
				3424	default string encoding. errors may be given to set a different error\n\
				3425	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3426	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3427
				3428	static PyObject *
				3429	unicode_encode(PyUnicodeObject self, PyObject args)
				3430	{
				3431	char *encoding = NULL;
				3432	char *errors = NULL;
				3433	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3434	return NULL;
				3435	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3436	}
				3437
				3438	static char expandtabs__doc__[] =
				3439	"S.expandtabs([tabsize]) -> unicode\n\
				3440	\n\
				3441	Return a copy of S where all tab characters are expanded using spaces.\n\
				3442	If tabsize is not given, a tab size of 8 characters is assumed.";
				3443
				3444	static PyObject*
				3445	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3446	{
				3447	Py_UNICODE *e;
				3448	Py_UNICODE *p;
				3449	Py_UNICODE *q;
				3450	int i, j;
				3451	PyUnicodeObject *u;
				3452	int tabsize = 8;
				3453
				3454	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3455	return NULL;
				3456
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3457	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3458	i = j = 0;
				3459	e = self->str + self->length;
				3460	for (p = self->str; p < e; p++)
				3461	if (*p == '\t') {
				3462	if (tabsize > 0)
				3463	j += tabsize - (j % tabsize);
				3464	}
				3465	else {
				3466	j++;
				3467	if (p == '\n' \|\| p == '\r') {
				3468	i += j;
				3469	j = 0;
				3470	}
				3471	}
				3472
				3473	/* Second pass: create output string and fill it */
				3474	u = _PyUnicode_New(i + j);
				3475	if (!u)
				3476	return NULL;
				3477
				3478	j = 0;
				3479	q = u->str;
				3480
				3481	for (p = self->str; p < e; p++)
				3482	if (*p == '\t') {
				3483	if (tabsize > 0) {
				3484	i = tabsize - (j % tabsize);
				3485	j += i;
				3486	while (i--)
				3487	*q++ = ' ';
				3488	}
				3489	}
				3490	else {
				3491	j++;
				3492	q++ = p;
				3493	if (p == '\n' \|\| p == '\r')
				3494	j = 0;
				3495	}
				3496
				3497	return (PyObject*) u;
				3498	}
				3499
				3500	static char find__doc__[] =
				3501	"S.find(sub [,start [,end]]) -> int\n\
				3502	\n\
				3503	Return the lowest index in S where substring sub is found,\n\
				3504	such that sub is contained within s[start,end]. Optional\n\
				3505	arguments start and end are interpreted as in slice notation.\n\
				3506	\n\
				3507	Return -1 on failure.";
				3508
				3509	static PyObject *
				3510	unicode_find(PyUnicodeObject self, PyObject args)
				3511	{
				3512	PyUnicodeObject *substring;
				3513	int start = 0;
				3514	int end = INT_MAX;
				3515	PyObject *result;
				3516
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3517	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3518	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3519	return NULL;
				3520	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3521	(PyObject *)substring);
				3522	if (substring == NULL)
				3523	return NULL;
				3524
				3525	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3526
				3527	Py_DECREF(substring);
				3528	return result;
				3529	}
				3530
				3531	static PyObject *
				3532	unicode_getitem(PyUnicodeObject *self, int index)
				3533	{
				3534	if (index < 0 \|\| index >= self->length) {
				3535	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3536	return NULL;
				3537	}
				3538
				3539	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3540	}
				3541
				3542	static long
				3543	unicode_hash(PyUnicodeObject *self)
				3544	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3545	/* Since Unicode objects compare equal to their ASCII string
				3546	counterparts, they should use the individual character values
				3547	as basis for their hash value. This is needed to assure that
				3548	strings and Unicode objects behave in the same way as
				3549	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3550
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3551	register int len;
				3552	register Py_UNICODE *p;
				3553	register long x;
				3554
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3555	if (self->hash != -1)
				3556	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3557	len = PyUnicode_GET_SIZE(self);
				3558	p = PyUnicode_AS_UNICODE(self);
				3559	x = *p << 7;
				3560	while (--len >= 0)
				3561	x = (1000003x) ^ p++;
				3562	x ^= PyUnicode_GET_SIZE(self);
				3563	if (x == -1)
				3564	x = -2;
				3565	self->hash = x;
				3566	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3567	}
				3568
				3569	static char index__doc__[] =
				3570	"S.index(sub [,start [,end]]) -> int\n\
				3571	\n\
				3572	Like S.find() but raise ValueError when the substring is not found.";
				3573
				3574	static PyObject *
				3575	unicode_index(PyUnicodeObject self, PyObject args)
				3576	{
				3577	int result;
				3578	PyUnicodeObject *substring;
				3579	int start = 0;
				3580	int end = INT_MAX;
				3581
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3582	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3583	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3584	return NULL;
				3585
				3586	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3587	(PyObject *)substring);
				3588	if (substring == NULL)
				3589	return NULL;
				3590
				3591	result = findstring(self, substring, start, end, 1);
				3592
				3593	Py_DECREF(substring);
				3594	if (result < 0) {
				3595	PyErr_SetString(PyExc_ValueError, "substring not found");
				3596	return NULL;
				3597	}
				3598	return PyInt_FromLong(result);
				3599	}
				3600
				3601	static char islower__doc__[] =
				3602	"S.islower() -> int\n\
				3603	\n\
				3604	Return 1 if all cased characters in S are lowercase and there is\n\
				3605	at least one cased character in S, 0 otherwise.";
				3606
				3607	static PyObject*
				3608	unicode_islower(PyUnicodeObject self, PyObject args)
				3609	{
				3610	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3611	register const Py_UNICODE *e;
				3612	int cased;
				3613
				3614	if (!PyArg_NoArgs(args))
				3615	return NULL;
				3616
				3617	/* Shortcut for single character strings */
				3618	if (PyUnicode_GET_SIZE(self) == 1)
				3619	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3620
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3621	/* Special case for empty strings */
				3622	if (PyString_GET_SIZE(self) == 0)
				3623	return PyInt_FromLong(0);
				3624
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3625	e = p + PyUnicode_GET_SIZE(self);
				3626	cased = 0;
				3627	for (; p < e; p++) {
				3628	register const Py_UNICODE ch = *p;
				3629
				3630	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3631	return PyInt_FromLong(0);
				3632	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3633	cased = 1;
				3634	}
				3635	return PyInt_FromLong(cased);
				3636	}
				3637
				3638	static char isupper__doc__[] =
				3639	"S.isupper() -> int\n\
				3640	\n\
				3641	Return 1 if all cased characters in S are uppercase and there is\n\
				3642	at least one cased character in S, 0 otherwise.";
				3643
				3644	static PyObject*
				3645	unicode_isupper(PyUnicodeObject self, PyObject args)
				3646	{
				3647	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3648	register const Py_UNICODE *e;
				3649	int cased;
				3650
				3651	if (!PyArg_NoArgs(args))
				3652	return NULL;
				3653
				3654	/* Shortcut for single character strings */
				3655	if (PyUnicode_GET_SIZE(self) == 1)
				3656	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3657
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3658	/* Special case for empty strings */
				3659	if (PyString_GET_SIZE(self) == 0)
				3660	return PyInt_FromLong(0);
				3661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3662	e = p + PyUnicode_GET_SIZE(self);
				3663	cased = 0;
				3664	for (; p < e; p++) {
				3665	register const Py_UNICODE ch = *p;
				3666
				3667	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3668	return PyInt_FromLong(0);
				3669	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3670	cased = 1;
				3671	}
				3672	return PyInt_FromLong(cased);
				3673	}
				3674
				3675	static char istitle__doc__[] =
				3676	"S.istitle() -> int\n\
				3677	\n\
				3678	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3679	may only follow uncased characters and lowercase characters only cased\n\
				3680	ones. Return 0 otherwise.";
				3681
				3682	static PyObject*
				3683	unicode_istitle(PyUnicodeObject self, PyObject args)
				3684	{
				3685	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3686	register const Py_UNICODE *e;
				3687	int cased, previous_is_cased;
				3688
				3689	if (!PyArg_NoArgs(args))
				3690	return NULL;
				3691
				3692	/* Shortcut for single character strings */
				3693	if (PyUnicode_GET_SIZE(self) == 1)
				3694	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3695	(Py_UNICODE_ISUPPER(*p) != 0));
				3696
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3697	/* Special case for empty strings */
				3698	if (PyString_GET_SIZE(self) == 0)
				3699	return PyInt_FromLong(0);
				3700
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3701	e = p + PyUnicode_GET_SIZE(self);
				3702	cased = 0;
				3703	previous_is_cased = 0;
				3704	for (; p < e; p++) {
				3705	register const Py_UNICODE ch = *p;
				3706
				3707	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3708	if (previous_is_cased)
				3709	return PyInt_FromLong(0);
				3710	previous_is_cased = 1;
				3711	cased = 1;
				3712	}
				3713	else if (Py_UNICODE_ISLOWER(ch)) {
				3714	if (!previous_is_cased)
				3715	return PyInt_FromLong(0);
				3716	previous_is_cased = 1;
				3717	cased = 1;
				3718	}
				3719	else
				3720	previous_is_cased = 0;
				3721	}
				3722	return PyInt_FromLong(cased);
				3723	}
				3724
				3725	static char isspace__doc__[] =
				3726	"S.isspace() -> int\n\
				3727	\n\
				3728	Return 1 if there are only whitespace characters in S,\n\
				3729	0 otherwise.";
				3730
				3731	static PyObject*
				3732	unicode_isspace(PyUnicodeObject self, PyObject args)
				3733	{
				3734	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3735	register const Py_UNICODE *e;
				3736
				3737	if (!PyArg_NoArgs(args))
				3738	return NULL;
				3739
				3740	/* Shortcut for single character strings */
				3741	if (PyUnicode_GET_SIZE(self) == 1 &&
				3742	Py_UNICODE_ISSPACE(*p))
				3743	return PyInt_FromLong(1);
				3744
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3745	/* Special case for empty strings */
				3746	if (PyString_GET_SIZE(self) == 0)
				3747	return PyInt_FromLong(0);
				3748
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3749	e = p + PyUnicode_GET_SIZE(self);
				3750	for (; p < e; p++) {
				3751	if (!Py_UNICODE_ISSPACE(*p))
				3752	return PyInt_FromLong(0);
				3753	}
				3754	return PyInt_FromLong(1);
				3755	}
				3756
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3757	static char isalpha__doc__[] =
				3758	"S.isalpha() -> int\n\
				3759	\n\
				3760	Return 1 if all characters in S are alphabetic\n\
				3761	and there is at least one character in S, 0 otherwise.";
				3762
				3763	static PyObject*
				3764	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3765	{
				3766	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3767	register const Py_UNICODE *e;
				3768
				3769	if (!PyArg_NoArgs(args))
				3770	return NULL;
				3771
				3772	/* Shortcut for single character strings */
				3773	if (PyUnicode_GET_SIZE(self) == 1 &&
				3774	Py_UNICODE_ISALPHA(*p))
				3775	return PyInt_FromLong(1);
				3776
				3777	/* Special case for empty strings */
				3778	if (PyString_GET_SIZE(self) == 0)
				3779	return PyInt_FromLong(0);
				3780
				3781	e = p + PyUnicode_GET_SIZE(self);
				3782	for (; p < e; p++) {
				3783	if (!Py_UNICODE_ISALPHA(*p))
				3784	return PyInt_FromLong(0);
				3785	}
				3786	return PyInt_FromLong(1);
				3787	}
				3788
				3789	static char isalnum__doc__[] =
				3790	"S.isalnum() -> int\n\
				3791	\n\
				3792	Return 1 if all characters in S are alphanumeric\n\
				3793	and there is at least one character in S, 0 otherwise.";
				3794
				3795	static PyObject*
				3796	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3797	{
				3798	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3799	register const Py_UNICODE *e;
				3800
				3801	if (!PyArg_NoArgs(args))
				3802	return NULL;
				3803
				3804	/* Shortcut for single character strings */
				3805	if (PyUnicode_GET_SIZE(self) == 1 &&
				3806	Py_UNICODE_ISALNUM(*p))
				3807	return PyInt_FromLong(1);
				3808
				3809	/* Special case for empty strings */
				3810	if (PyString_GET_SIZE(self) == 0)
				3811	return PyInt_FromLong(0);
				3812
				3813	e = p + PyUnicode_GET_SIZE(self);
				3814	for (; p < e; p++) {
				3815	if (!Py_UNICODE_ISALNUM(*p))
				3816	return PyInt_FromLong(0);
				3817	}
				3818	return PyInt_FromLong(1);
				3819	}
				3820
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3821	static char isdecimal__doc__[] =
				3822	"S.isdecimal() -> int\n\
				3823	\n\
				3824	Return 1 if there are only decimal characters in S,\n\
				3825	0 otherwise.";
				3826
				3827	static PyObject*
				3828	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3829	{
				3830	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3831	register const Py_UNICODE *e;
				3832
				3833	if (!PyArg_NoArgs(args))
				3834	return NULL;
				3835
				3836	/* Shortcut for single character strings */
				3837	if (PyUnicode_GET_SIZE(self) == 1 &&
				3838	Py_UNICODE_ISDECIMAL(*p))
				3839	return PyInt_FromLong(1);
				3840
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3841	/* Special case for empty strings */
				3842	if (PyString_GET_SIZE(self) == 0)
				3843	return PyInt_FromLong(0);
				3844
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3845	e = p + PyUnicode_GET_SIZE(self);
				3846	for (; p < e; p++) {
				3847	if (!Py_UNICODE_ISDECIMAL(*p))
				3848	return PyInt_FromLong(0);
				3849	}
				3850	return PyInt_FromLong(1);
				3851	}
				3852
				3853	static char isdigit__doc__[] =
				3854	"S.isdigit() -> int\n\
				3855	\n\
				3856	Return 1 if there are only digit characters in S,\n\
				3857	0 otherwise.";
				3858
				3859	static PyObject*
				3860	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3861	{
				3862	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3863	register const Py_UNICODE *e;
				3864
				3865	if (!PyArg_NoArgs(args))
				3866	return NULL;
				3867
				3868	/* Shortcut for single character strings */
				3869	if (PyUnicode_GET_SIZE(self) == 1 &&
				3870	Py_UNICODE_ISDIGIT(*p))
				3871	return PyInt_FromLong(1);
				3872
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3873	/* Special case for empty strings */
				3874	if (PyString_GET_SIZE(self) == 0)
				3875	return PyInt_FromLong(0);
				3876
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3877	e = p + PyUnicode_GET_SIZE(self);
				3878	for (; p < e; p++) {
				3879	if (!Py_UNICODE_ISDIGIT(*p))
				3880	return PyInt_FromLong(0);
				3881	}
				3882	return PyInt_FromLong(1);
				3883	}
				3884
				3885	static char isnumeric__doc__[] =
				3886	"S.isnumeric() -> int\n\
				3887	\n\
				3888	Return 1 if there are only numeric characters in S,\n\
				3889	0 otherwise.";
				3890
				3891	static PyObject*
				3892	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3893	{
				3894	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3895	register const Py_UNICODE *e;
				3896
				3897	if (!PyArg_NoArgs(args))
				3898	return NULL;
				3899
				3900	/* Shortcut for single character strings */
				3901	if (PyUnicode_GET_SIZE(self) == 1 &&
				3902	Py_UNICODE_ISNUMERIC(*p))
				3903	return PyInt_FromLong(1);
				3904
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3905	/* Special case for empty strings */
				3906	if (PyString_GET_SIZE(self) == 0)
				3907	return PyInt_FromLong(0);
				3908
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3909	e = p + PyUnicode_GET_SIZE(self);
				3910	for (; p < e; p++) {
				3911	if (!Py_UNICODE_ISNUMERIC(*p))
				3912	return PyInt_FromLong(0);
				3913	}
				3914	return PyInt_FromLong(1);
				3915	}
				3916
				3917	static char join__doc__[] =
				3918	"S.join(sequence) -> unicode\n\
				3919	\n\
				3920	Return a string which is the concatenation of the strings in the\n\
				3921	sequence. The separator between elements is S.";
				3922
				3923	static PyObject*
				3924	unicode_join(PyUnicodeObject self, PyObject args)
				3925	{
				3926	PyObject *data;
				3927	if (!PyArg_ParseTuple(args, "O:join", &data))
				3928	return NULL;
				3929
				3930	return PyUnicode_Join((PyObject *)self, data);
				3931	}
				3932
				3933	static int
				3934	unicode_length(PyUnicodeObject *self)
				3935	{
				3936	return self->length;
				3937	}
				3938
				3939	static char ljust__doc__[] =
				3940	"S.ljust(width) -> unicode\n\
				3941	\n\
				3942	Return S left justified in a Unicode string of length width. Padding is\n\
				3943	done using spaces.";
				3944
				3945	static PyObject *
				3946	unicode_ljust(PyUnicodeObject self, PyObject args)
				3947	{
				3948	int width;
				3949	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3950	return NULL;
				3951
				3952	if (self->length >= width) {
				3953	Py_INCREF(self);
				3954	return (PyObject*) self;
				3955	}
				3956
				3957	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3958	}
				3959
				3960	static char lower__doc__[] =
				3961	"S.lower() -> unicode\n\
				3962	\n\
				3963	Return a copy of the string S converted to lowercase.";
				3964
				3965	static PyObject*
				3966	unicode_lower(PyUnicodeObject self, PyObject args)
				3967	{
				3968	if (!PyArg_NoArgs(args))
				3969	return NULL;
				3970	return fixup(self, fixlower);
				3971	}
				3972
				3973	static char lstrip__doc__[] =
				3974	"S.lstrip() -> unicode\n\
				3975	\n\
				3976	Return a copy of the string S with leading whitespace removed.";
				3977
				3978	static PyObject *
				3979	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3980	{
				3981	if (!PyArg_NoArgs(args))
				3982	return NULL;
				3983	return strip(self, 1, 0);
				3984	}
				3985
				3986	static PyObject*
				3987	unicode_repeat(PyUnicodeObject *str, int len)
				3988	{
				3989	PyUnicodeObject *u;
				3990	Py_UNICODE *p;
				3991
				3992	if (len < 0)
				3993	len = 0;
				3994
				3995	if (len == 1) {
				3996	/* no repeat, return original string */
				3997	Py_INCREF(str);
				3998	return (PyObject*) str;
				3999	}
				4000
				4001	u = _PyUnicode_New(len * str->length);
				4002	if (!u)
				4003	return NULL;
				4004
				4005	p = u->str;
				4006
				4007	while (len-- > 0) {
				4008	Py_UNICODE_COPY(p, str->str, str->length);
				4009	p += str->length;
				4010	}
				4011
				4012	return (PyObject*) u;
				4013	}
				4014
				4015	PyObject PyUnicode_Replace(PyObject obj,
				4016	PyObject *subobj,
				4017	PyObject *replobj,
				4018	int maxcount)
				4019	{
				4020	PyObject *self;
				4021	PyObject *str1;
				4022	PyObject *str2;
				4023	PyObject *result;
				4024
				4025	self = PyUnicode_FromObject(obj);
				4026	if (self == NULL)
				4027	return NULL;
				4028	str1 = PyUnicode_FromObject(subobj);
				4029	if (str1 == NULL) {
				4030	Py_DECREF(self);
				4031	return NULL;
				4032	}
				4033	str2 = PyUnicode_FromObject(replobj);
				4034	if (str2 == NULL) {
				4035	Py_DECREF(self);
				4036	Py_DECREF(str1);
				4037	return NULL;
				4038	}
				4039	result = replace((PyUnicodeObject *)self,
				4040	(PyUnicodeObject *)str1,
				4041	(PyUnicodeObject *)str2,
				4042	maxcount);
				4043	Py_DECREF(self);
				4044	Py_DECREF(str1);
				4045	Py_DECREF(str2);
				4046	return result;
				4047	}
				4048
				4049	static char replace__doc__[] =
				4050	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4051	\n\
				4052	Return a copy of S with all occurrences of substring\n\
				4053	old replaced by new. If the optional argument maxsplit is\n\
				4054	given, only the first maxsplit occurrences are replaced.";
				4055
				4056	static PyObject*
				4057	unicode_replace(PyUnicodeObject self, PyObject args)
				4058	{
				4059	PyUnicodeObject *str1;
				4060	PyUnicodeObject *str2;
				4061	int maxcount = -1;
				4062	PyObject *result;
				4063
				4064	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4065	return NULL;
				4066	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4067	if (str1 == NULL)
				4068	return NULL;
				4069	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4070	if (str2 == NULL)
				4071	return NULL;
				4072
				4073	result = replace(self, str1, str2, maxcount);
				4074
				4075	Py_DECREF(str1);
				4076	Py_DECREF(str2);
				4077	return result;
				4078	}
				4079
				4080	static
				4081	PyObject unicode_repr(PyObject unicode)
				4082	{
				4083	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4084	PyUnicode_GET_SIZE(unicode),
				4085	1);
				4086	}
				4087
				4088	static char rfind__doc__[] =
				4089	"S.rfind(sub [,start [,end]]) -> int\n\
				4090	\n\
				4091	Return the highest index in S where substring sub is found,\n\
				4092	such that sub is contained within s[start,end]. Optional\n\
				4093	arguments start and end are interpreted as in slice notation.\n\
				4094	\n\
				4095	Return -1 on failure.";
				4096
				4097	static PyObject *
				4098	unicode_rfind(PyUnicodeObject self, PyObject args)
				4099	{
				4100	PyUnicodeObject *substring;
				4101	int start = 0;
				4102	int end = INT_MAX;
				4103	PyObject *result;
				4104
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4105	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4106	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4107	return NULL;
				4108	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4109	(PyObject *)substring);
				4110	if (substring == NULL)
				4111	return NULL;
				4112
				4113	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4114
				4115	Py_DECREF(substring);
				4116	return result;
				4117	}
				4118
				4119	static char rindex__doc__[] =
				4120	"S.rindex(sub [,start [,end]]) -> int\n\
				4121	\n\
				4122	Like S.rfind() but raise ValueError when the substring is not found.";
				4123
				4124	static PyObject *
				4125	unicode_rindex(PyUnicodeObject self, PyObject args)
				4126	{
				4127	int result;
				4128	PyUnicodeObject *substring;
				4129	int start = 0;
				4130	int end = INT_MAX;
				4131
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4132	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4133	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4134	return NULL;
				4135	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4136	(PyObject *)substring);
				4137	if (substring == NULL)
				4138	return NULL;
				4139
				4140	result = findstring(self, substring, start, end, -1);
				4141
				4142	Py_DECREF(substring);
				4143	if (result < 0) {
				4144	PyErr_SetString(PyExc_ValueError, "substring not found");
				4145	return NULL;
				4146	}
				4147	return PyInt_FromLong(result);
				4148	}
				4149
				4150	static char rjust__doc__[] =
				4151	"S.rjust(width) -> unicode\n\
				4152	\n\
				4153	Return S right justified in a Unicode string of length width. Padding is\n\
				4154	done using spaces.";
				4155
				4156	static PyObject *
				4157	unicode_rjust(PyUnicodeObject self, PyObject args)
				4158	{
				4159	int width;
				4160	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4161	return NULL;
				4162
				4163	if (self->length >= width) {
				4164	Py_INCREF(self);
				4165	return (PyObject*) self;
				4166	}
				4167
				4168	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4169	}
				4170
				4171	static char rstrip__doc__[] =
				4172	"S.rstrip() -> unicode\n\
				4173	\n\
				4174	Return a copy of the string S with trailing whitespace removed.";
				4175
				4176	static PyObject *
				4177	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4178	{
				4179	if (!PyArg_NoArgs(args))
				4180	return NULL;
				4181	return strip(self, 0, 1);
				4182	}
				4183
				4184	static PyObject*
				4185	unicode_slice(PyUnicodeObject *self, int start, int end)
				4186	{
				4187	/* standard clamping */
				4188	if (start < 0)
				4189	start = 0;
				4190	if (end < 0)
				4191	end = 0;
				4192	if (end > self->length)
				4193	end = self->length;
				4194	if (start == 0 && end == self->length) {
				4195	/* full slice, return original string */
				4196	Py_INCREF(self);
				4197	return (PyObject*) self;
				4198	}
				4199	if (start > end)
				4200	start = end;
				4201	/* copy slice */
				4202	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4203	end - start);
				4204	}
				4205
				4206	PyObject PyUnicode_Split(PyObject s,
				4207	PyObject *sep,
				4208	int maxsplit)
				4209	{
				4210	PyObject *result;
				4211
				4212	s = PyUnicode_FromObject(s);
				4213	if (s == NULL)
				4214	return NULL;
				4215	if (sep != NULL) {
				4216	sep = PyUnicode_FromObject(sep);
				4217	if (sep == NULL) {
				4218	Py_DECREF(s);
				4219	return NULL;
				4220	}
				4221	}
				4222
				4223	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4224
				4225	Py_DECREF(s);
				4226	Py_XDECREF(sep);
				4227	return result;
				4228	}
				4229
				4230	static char split__doc__[] =
				4231	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4232	\n\
				4233	Return a list of the words in S, using sep as the\n\
				4234	delimiter string. If maxsplit is given, at most maxsplit\n\
				4235	splits are done. If sep is not specified, any whitespace string\n\
				4236	is a separator.";
				4237
				4238	static PyObject*
				4239	unicode_split(PyUnicodeObject self, PyObject args)
				4240	{
				4241	PyObject *substring = Py_None;
				4242	int maxcount = -1;
				4243
				4244	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4245	return NULL;
				4246
				4247	if (substring == Py_None)
				4248	return split(self, NULL, maxcount);
				4249	else if (PyUnicode_Check(substring))
				4250	return split(self, (PyUnicodeObject *)substring, maxcount);
				4251	else
				4252	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4253	}
				4254
				4255	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4256	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4257	\n\
				4258	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4259	Line breaks are not included in the resulting list unless keepends\n\
				4260	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4261
				4262	static PyObject*
				4263	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4264	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4265	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4266
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4267	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4268	return NULL;
				4269
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4270	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4271	}
				4272
				4273	static
				4274	PyObject unicode_str(PyUnicodeObject self)
				4275	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4276	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4277	}
				4278
				4279	static char strip__doc__[] =
				4280	"S.strip() -> unicode\n\
				4281	\n\
				4282	Return a copy of S with leading and trailing whitespace removed.";
				4283
				4284	static PyObject *
				4285	unicode_strip(PyUnicodeObject self, PyObject args)
				4286	{
				4287	if (!PyArg_NoArgs(args))
				4288	return NULL;
				4289	return strip(self, 1, 1);
				4290	}
				4291
				4292	static char swapcase__doc__[] =
				4293	"S.swapcase() -> unicode\n\
				4294	\n\
				4295	Return a copy of S with uppercase characters converted to lowercase\n\
				4296	and vice versa.";
				4297
				4298	static PyObject*
				4299	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4300	{
				4301	if (!PyArg_NoArgs(args))
				4302	return NULL;
				4303	return fixup(self, fixswapcase);
				4304	}
				4305
				4306	static char translate__doc__[] =
				4307	"S.translate(table) -> unicode\n\
				4308	\n\
				4309	Return a copy of the string S, where all characters have been mapped\n\
				4310	through the given translation table, which must be a mapping of\n\
				4311	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4312	are left untouched. Characters mapped to None are deleted.";
				4313
				4314	static PyObject*
				4315	unicode_translate(PyUnicodeObject self, PyObject args)
				4316	{
				4317	PyObject *table;
				4318
				4319	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4320	return NULL;
				4321	return PyUnicode_TranslateCharmap(self->str,
				4322	self->length,
				4323	table,
				4324	"ignore");
				4325	}
				4326
				4327	static char upper__doc__[] =
				4328	"S.upper() -> unicode\n\
				4329	\n\
				4330	Return a copy of S converted to uppercase.";
				4331
				4332	static PyObject*
				4333	unicode_upper(PyUnicodeObject self, PyObject args)
				4334	{
				4335	if (!PyArg_NoArgs(args))
				4336	return NULL;
				4337	return fixup(self, fixupper);
				4338	}
				4339
				4340	#if 0
				4341	static char zfill__doc__[] =
				4342	"S.zfill(width) -> unicode\n\
				4343	\n\
				4344	Pad a numeric string x with zeros on the left, to fill a field\n\
				4345	of the specified width. The string x is never truncated.";
				4346
				4347	static PyObject *
				4348	unicode_zfill(PyUnicodeObject self, PyObject args)
				4349	{
				4350	int fill;
				4351	PyUnicodeObject *u;
				4352
				4353	int width;
				4354	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4355	return NULL;
				4356
				4357	if (self->length >= width) {
				4358	Py_INCREF(self);
				4359	return (PyObject*) self;
				4360	}
				4361
				4362	fill = width - self->length;
				4363
				4364	u = pad(self, fill, 0, '0');
				4365
				4366	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4367	/* move sign to beginning of string */
				4368	u->str[0] = u->str[fill];
				4369	u->str[fill] = '0';
				4370	}
				4371
				4372	return (PyObject*) u;
				4373	}
				4374	#endif
				4375
				4376	#if 0
				4377	static PyObject*
				4378	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4379	{
				4380	if (!PyArg_NoArgs(args))
				4381	return NULL;
				4382	return PyInt_FromLong(unicode_freelist_size);
				4383	}
				4384	#endif
				4385
				4386	static char startswith__doc__[] =
				4387	"S.startswith(prefix[, start[, end]]) -> int\n\
				4388	\n\
				4389	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4390	optional start, test S beginning at that position. With optional end, stop\n\
				4391	comparing S at that position.";
				4392
				4393	static PyObject *
				4394	unicode_startswith(PyUnicodeObject *self,
				4395	PyObject *args)
				4396	{
				4397	PyUnicodeObject *substring;
				4398	int start = 0;
				4399	int end = INT_MAX;
				4400	PyObject *result;
				4401
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4402	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4403	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4404	return NULL;
				4405	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4406	(PyObject *)substring);
				4407	if (substring == NULL)
				4408	return NULL;
				4409
				4410	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4411
				4412	Py_DECREF(substring);
				4413	return result;
				4414	}
				4415
				4416
				4417	static char endswith__doc__[] =
				4418	"S.endswith(suffix[, start[, end]]) -> int\n\
				4419	\n\
				4420	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4421	optional start, test S beginning at that position. With optional end, stop\n\
				4422	comparing S at that position.";
				4423
				4424	static PyObject *
				4425	unicode_endswith(PyUnicodeObject *self,
				4426	PyObject *args)
				4427	{
				4428	PyUnicodeObject *substring;
				4429	int start = 0;
				4430	int end = INT_MAX;
				4431	PyObject *result;
				4432
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4433	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4434	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4435	return NULL;
				4436	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4437	(PyObject *)substring);
				4438	if (substring == NULL)
				4439	return NULL;
				4440
				4441	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4442
				4443	Py_DECREF(substring);
				4444	return result;
				4445	}
				4446
				4447
				4448	static PyMethodDef unicode_methods[] = {
				4449
				4450	/* Order is according to common usage: often used methods should
				4451	appear first, since lookup is done sequentially. */
				4452
				4453	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4454	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4455	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4456	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4457	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4458	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4459	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4460	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4461	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4462	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4463	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4464	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4465	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4466	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4467	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4468	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4469	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4470	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4471	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4472	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4473	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4474	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4475	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4476	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4477	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4478	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4479	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4480	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4481	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4482	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4483	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4484	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4485	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4486	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4487	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4488	#if 0
				4489	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4490	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4491	#endif
				4492
				4493	#if 0
				4494	/* This one is just used for debugging the implementation. */
				4495	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4496	#endif
				4497
				4498	{NULL, NULL}
				4499	};
				4500
				4501	static PyObject *
				4502	unicode_getattr(PyUnicodeObject self, char name)
				4503	{
				4504	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4505	}
				4506
				4507	static PySequenceMethods unicode_as_sequence = {
				4508	(inquiry) unicode_length, /* sq_length */
				4509	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4510	(intargfunc) unicode_repeat, /* sq_repeat */
				4511	(intargfunc) unicode_getitem, /* sq_item */
				4512	(intintargfunc) unicode_slice, /* sq_slice */
				4513	0, /* sq_ass_item */
				4514	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4515	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4516	};
				4517
				4518	static int
				4519	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4520	int index,
				4521	const void **ptr)
				4522	{
				4523	if (index != 0) {
				4524	PyErr_SetString(PyExc_SystemError,
				4525	"accessing non-existent unicode segment");
				4526	return -1;
				4527	}
				4528	ptr = (void ) self->str;
				4529	return PyUnicode_GET_DATA_SIZE(self);
				4530	}
				4531
				4532	static int
				4533	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4534	const void **ptr)
				4535	{
				4536	PyErr_SetString(PyExc_TypeError,
				4537	"cannot use unicode as modifyable buffer");
				4538	return -1;
				4539	}
				4540
				4541	static int
				4542	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4543	int *lenp)
				4544	{
				4545	if (lenp)
				4546	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4547	return 1;
				4548	}
				4549
				4550	static int
				4551	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4552	int index,
				4553	const void **ptr)
				4554	{
				4555	PyObject *str;
				4556
				4557	if (index != 0) {
				4558	PyErr_SetString(PyExc_SystemError,
				4559	"accessing non-existent unicode segment");
				4560	return -1;
				4561	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4562	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4563	if (str == NULL)
				4564	return -1;
				4565	ptr = (void ) PyString_AS_STRING(str);
				4566	return PyString_GET_SIZE(str);
				4567	}
				4568
				4569	/* Helpers for PyUnicode_Format() */
				4570
				4571	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4572	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4573	{
				4574	int argidx = *p_argidx;
				4575	if (argidx < arglen) {
				4576	(*p_argidx)++;
				4577	if (arglen < 0)
				4578	return args;
				4579	else
				4580	return PyTuple_GetItem(args, argidx);
				4581	}
				4582	PyErr_SetString(PyExc_TypeError,
				4583	"not enough arguments for format string");
				4584	return NULL;
				4585	}
				4586
				4587	#define F_LJUST (1<<0)
				4588	#define F_SIGN (1<<1)
				4589	#define F_BLANK (1<<2)
				4590	#define F_ALT (1<<3)
				4591	#define F_ZERO (1<<4)
				4592
				4593	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4594	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4595	{
				4596	register int i;
				4597	int len;
				4598	va_list va;
				4599	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4600	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4601
				4602	/* First, format the string as char array, then expand to Py_UNICODE
				4603	array. */
				4604	charbuffer = (char *)buffer;
				4605	len = vsprintf(charbuffer, format, va);
				4606	for (i = len - 1; i >= 0; i--)
				4607	buffer[i] = (Py_UNICODE) charbuffer[i];
				4608
				4609	va_end(va);
				4610	return len;
				4611	}
				4612
				4613	static int
				4614	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4615	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4616	int flags,
				4617	int prec,
				4618	int type,
				4619	PyObject *v)
				4620	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4621	/* fmt = '%#.' + `prec` + `type`
				4622	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4623	char fmt[20];
				4624	double x;
				4625
				4626	x = PyFloat_AsDouble(v);
				4627	if (x == -1.0 && PyErr_Occurred())
				4628	return -1;
				4629	if (prec < 0)
				4630	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4631	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4632	type = 'g';
				4633	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4634	/* worst case length calc to ensure no buffer overrun:
				4635	fmt = %#.<prec>g
				4636	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4637	for any double rep.)
				4638	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4639	If prec=0 the effective precision is 1 (the leading digit is
				4640	always given), therefore increase by one to 10+prec. */
				4641	if (buflen <= (size_t)10 + (size_t)prec) {
				4642	PyErr_SetString(PyExc_OverflowError,
				4643	"formatted float is too long (precision too long?)");
				4644	return -1;
				4645	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4646	return usprintf(buf, fmt, x);
				4647	}
				4648
				4649	static int
				4650	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4651	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4652	int flags,
				4653	int prec,
				4654	int type,
				4655	PyObject *v)
				4656	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4657	/* fmt = '%#.' + `prec` + 'l' + `type`
				4658	worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4659	char fmt[20];
				4660	long x;
				4661
				4662	x = PyInt_AsLong(v);
				4663	if (x == -1 && PyErr_Occurred())
				4664	return -1;
				4665	if (prec < 0)
				4666	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4667	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4668	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4669	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4670	PyErr_SetString(PyExc_OverflowError,
				4671	"formatted integer is too long (precision too long?)");
				4672	return -1;
				4673	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4674	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4675	return usprintf(buf, fmt, x);
				4676	}
				4677
				4678	static int
				4679	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4680	size_t buflen,
				4681	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4682	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4683	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4684	if (PyUnicode_Check(v)) {
				4685	if (PyUnicode_GET_SIZE(v) != 1)
				4686	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4687	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4688	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4689
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4690	else if (PyString_Check(v)) {
				4691	if (PyString_GET_SIZE(v) != 1)
				4692	goto onError;
				4693	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4694	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4695
				4696	else {
				4697	/* Integer input truncated to a character */
				4698	long x;
				4699	x = PyInt_AsLong(v);
				4700	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4701	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4702	buf[0] = (char) x;
				4703	}
				4704	buf[1] = '\0';
				4705	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4706
				4707	onError:
				4708	PyErr_SetString(PyExc_TypeError,
				4709	"%c requires int or char");
				4710	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4711	}
				4712
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast binds as one
   unit wherever the macro is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
				4722
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4723	PyObject PyUnicode_Format(PyObject format,
				4724	PyObject *args)
				4725	{
				4726	Py_UNICODE fmt, res;
				4727	int fmtcnt, rescnt, reslen, arglen, argidx;
				4728	int args_owned = 0;
				4729	PyUnicodeObject *result = NULL;
				4730	PyObject *dict = NULL;
				4731	PyObject *uformat;
				4732
				4733	if (format == NULL \|\| args == NULL) {
				4734	PyErr_BadInternalCall();
				4735	return NULL;
				4736	}
				4737	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4738	if (uformat == NULL)
				4739	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4740	fmt = PyUnicode_AS_UNICODE(uformat);
				4741	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4742
				4743	reslen = rescnt = fmtcnt + 100;
				4744	result = _PyUnicode_New(reslen);
				4745	if (result == NULL)
				4746	goto onError;
				4747	res = PyUnicode_AS_UNICODE(result);
				4748
				4749	if (PyTuple_Check(args)) {
				4750	arglen = PyTuple_Size(args);
				4751	argidx = 0;
				4752	}
				4753	else {
				4754	arglen = -1;
				4755	argidx = -2;
				4756	}
				4757	if (args->ob_type->tp_as_mapping)
				4758	dict = args;
				4759
				4760	while (--fmtcnt >= 0) {
				4761	if (*fmt != '%') {
				4762	if (--rescnt < 0) {
				4763	rescnt = fmtcnt + 100;
				4764	reslen += rescnt;
				4765	if (_PyUnicode_Resize(result, reslen) < 0)
				4766	return NULL;
				4767	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4768	--rescnt;
				4769	}
				4770	res++ = fmt++;
				4771	}
				4772	else {
				4773	/* Got a format specifier */
				4774	int flags = 0;
				4775	int width = -1;
				4776	int prec = -1;
				4777	int size = 0;
				4778	Py_UNICODE c = '\0';
				4779	Py_UNICODE fill;
				4780	PyObject *v = NULL;
				4781	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4782	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4783	Py_UNICODE sign;
				4784	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4785	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4786
				4787	fmt++;
				4788	if (*fmt == '(') {
				4789	Py_UNICODE *keystart;
				4790	int keylen;
				4791	PyObject *key;
				4792	int pcount = 1;
				4793
				4794	if (dict == NULL) {
				4795	PyErr_SetString(PyExc_TypeError,
				4796	"format requires a mapping");
				4797	goto onError;
				4798	}
				4799	++fmt;
				4800	--fmtcnt;
				4801	keystart = fmt;
				4802	/* Skip over balanced parentheses */
				4803	while (pcount > 0 && --fmtcnt >= 0) {
				4804	if (*fmt == ')')
				4805	--pcount;
				4806	else if (*fmt == '(')
				4807	++pcount;
				4808	fmt++;
				4809	}
				4810	keylen = fmt - keystart - 1;
				4811	if (fmtcnt < 0 \|\| pcount > 0) {
				4812	PyErr_SetString(PyExc_ValueError,
				4813	"incomplete format key");
				4814	goto onError;
				4815	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4816	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4817	then looked up since Python uses strings to hold
				4818	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4819	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4820	key = PyUnicode_EncodeUTF8(keystart,
				4821	keylen,
				4822	NULL);
				4823	if (key == NULL)
				4824	goto onError;
				4825	if (args_owned) {
				4826	Py_DECREF(args);
				4827	args_owned = 0;
				4828	}
				4829	args = PyObject_GetItem(dict, key);
				4830	Py_DECREF(key);
				4831	if (args == NULL) {
				4832	goto onError;
				4833	}
				4834	args_owned = 1;
				4835	arglen = -1;
				4836	argidx = -2;
				4837	}
				4838	while (--fmtcnt >= 0) {
				4839	switch (c = *fmt++) {
				4840	case '-': flags \|= F_LJUST; continue;
				4841	case '+': flags \|= F_SIGN; continue;
				4842	case ' ': flags \|= F_BLANK; continue;
				4843	case '#': flags \|= F_ALT; continue;
				4844	case '0': flags \|= F_ZERO; continue;
				4845	}
				4846	break;
				4847	}
				4848	if (c == '*') {
				4849	v = getnextarg(args, arglen, &argidx);
				4850	if (v == NULL)
				4851	goto onError;
				4852	if (!PyInt_Check(v)) {
				4853	PyErr_SetString(PyExc_TypeError,
				4854	"* wants int");
				4855	goto onError;
				4856	}
				4857	width = PyInt_AsLong(v);
				4858	if (width < 0) {
				4859	flags \|= F_LJUST;
				4860	width = -width;
				4861	}
				4862	if (--fmtcnt >= 0)
				4863	c = *fmt++;
				4864	}
				4865	else if (c >= '0' && c <= '9') {
				4866	width = c - '0';
				4867	while (--fmtcnt >= 0) {
				4868	c = *fmt++;
				4869	if (c < '0' \|\| c > '9')
				4870	break;
				4871	if ((width*10) / 10 != width) {
				4872	PyErr_SetString(PyExc_ValueError,
				4873	"width too big");
				4874	goto onError;
				4875	}
				4876	width = width*10 + (c - '0');
				4877	}
				4878	}
				4879	if (c == '.') {
				4880	prec = 0;
				4881	if (--fmtcnt >= 0)
				4882	c = *fmt++;
				4883	if (c == '*') {
				4884	v = getnextarg(args, arglen, &argidx);
				4885	if (v == NULL)
				4886	goto onError;
				4887	if (!PyInt_Check(v)) {
				4888	PyErr_SetString(PyExc_TypeError,
				4889	"* wants int");
				4890	goto onError;
				4891	}
				4892	prec = PyInt_AsLong(v);
				4893	if (prec < 0)
				4894	prec = 0;
				4895	if (--fmtcnt >= 0)
				4896	c = *fmt++;
				4897	}
				4898	else if (c >= '0' && c <= '9') {
				4899	prec = c - '0';
				4900	while (--fmtcnt >= 0) {
				4901	c = Py_CHARMASK(*fmt++);
				4902	if (c < '0' \|\| c > '9')
				4903	break;
				4904	if ((prec*10) / 10 != prec) {
				4905	PyErr_SetString(PyExc_ValueError,
				4906	"prec too big");
				4907	goto onError;
				4908	}
				4909	prec = prec*10 + (c - '0');
				4910	}
				4911	}
				4912	} /* prec */
				4913	if (fmtcnt >= 0) {
				4914	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4915	size = c;
				4916	if (--fmtcnt >= 0)
				4917	c = *fmt++;
				4918	}
				4919	}
				4920	if (fmtcnt < 0) {
				4921	PyErr_SetString(PyExc_ValueError,
				4922	"incomplete format");
				4923	goto onError;
				4924	}
				4925	if (c != '%') {
				4926	v = getnextarg(args, arglen, &argidx);
				4927	if (v == NULL)
				4928	goto onError;
				4929	}
				4930	sign = 0;
				4931	fill = ' ';
				4932	switch (c) {
				4933
				4934	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4935	pbuf = formatbuf;
				4936	/* presume that buffer length is at least 1 */
				4937	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4938	len = 1;
				4939	break;
				4940
				4941	case 's':
				4942	case 'r':
				4943	if (PyUnicode_Check(v) && c == 's') {
				4944	temp = v;
				4945	Py_INCREF(temp);
				4946	}
				4947	else {
				4948	PyObject *unicode;
				4949	if (c == 's')
				4950	temp = PyObject_Str(v);
				4951	else
				4952	temp = PyObject_Repr(v);
				4953	if (temp == NULL)
				4954	goto onError;
				4955	if (!PyString_Check(temp)) {
				4956	/* XXX Note: this should never happen, since
				4957	PyObject_Repr() and PyObject_Str() assure
				4958	this */
				4959	Py_DECREF(temp);
				4960	PyErr_SetString(PyExc_TypeError,
				4961	"%s argument has non-string str()");
				4962	goto onError;
				4963	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4964	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4965	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4966	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4967	"strict");
				4968	Py_DECREF(temp);
				4969	temp = unicode;
				4970	if (temp == NULL)
				4971	goto onError;
				4972	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4973	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4974	len = PyUnicode_GET_SIZE(temp);
				4975	if (prec >= 0 && len > prec)
				4976	len = prec;
				4977	break;
				4978
				4979	case 'i':
				4980	case 'd':
				4981	case 'u':
				4982	case 'o':
				4983	case 'x':
				4984	case 'X':
				4985	if (c == 'i')
				4986	c = 'd';
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4987	pbuf = formatbuf;
				4988	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4989	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4990	if (len < 0)
				4991	goto onError;
				4992	sign = (c == 'd');
				4993	if (flags & F_ZERO) {
				4994	fill = '0';
				4995	if ((flags&F_ALT) &&
				4996	(c == 'x' \|\| c == 'X') &&
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4997	pbuf[0] == '0' && pbuf[1] == c) {
				4998	res++ = pbuf++;
				4999	res++ = pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5000	rescnt -= 2;
				5001	len -= 2;
				5002	width -= 2;
				5003	if (width < 0)
				5004	width = 0;
				5005	}
				5006	}
				5007	break;
				5008
				5009	case 'e':
				5010	case 'E':
				5011	case 'f':
				5012	case 'g':
				5013	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5014	pbuf = formatbuf;
				5015	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5016	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5017	if (len < 0)
				5018	goto onError;
				5019	sign = 1;
				5020	if (flags&F_ZERO)
				5021	fill = '0';
				5022	break;
				5023
				5024	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5025	pbuf = formatbuf;
				5026	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5027	if (len < 0)
				5028	goto onError;
				5029	break;
				5030
				5031	default:
				5032	PyErr_Format(PyExc_ValueError,
				5033	"unsupported format character '%c' (0x%x)",
				5034	c, c);
				5035	goto onError;
				5036	}
				5037	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5038	if (pbuf == '-' \|\| pbuf == '+') {
				5039	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5040	len--;
				5041	}
				5042	else if (flags & F_SIGN)
				5043	sign = '+';
				5044	else if (flags & F_BLANK)
				5045	sign = ' ';
				5046	else
				5047	sign = 0;
				5048	}
				5049	if (width < len)
				5050	width = len;
				5051	if (rescnt < width + (sign != 0)) {
				5052	reslen -= rescnt;
				5053	rescnt = width + fmtcnt + 100;
				5054	reslen += rescnt;
				5055	if (_PyUnicode_Resize(result, reslen) < 0)
				5056	return NULL;
				5057	res = PyUnicode_AS_UNICODE(result)
				5058	+ reslen - rescnt;
				5059	}
				5060	if (sign) {
				5061	if (fill != ' ')
				5062	*res++ = sign;
				5063	rescnt--;
				5064	if (width > len)
				5065	width--;
				5066	}
				5067	if (width > len && !(flags & F_LJUST)) {
				5068	do {
				5069	--rescnt;
				5070	*res++ = fill;
				5071	} while (--width > len);
				5072	}
				5073	if (sign && fill == ' ')
				5074	*res++ = sign;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5075	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5076	res += len;
				5077	rescnt -= len;
				5078	while (--width >= len) {
				5079	--rescnt;
				5080	*res++ = ' ';
				5081	}
				5082	if (dict && (argidx < arglen) && c != '%') {
				5083	PyErr_SetString(PyExc_TypeError,
				5084	"not all arguments converted");
				5085	goto onError;
				5086	}
				5087	Py_XDECREF(temp);
				5088	} /* '%' */
				5089	} /* until end */
				5090	if (argidx < arglen && !dict) {
				5091	PyErr_SetString(PyExc_TypeError,
				5092	"not all arguments converted");
				5093	goto onError;
				5094	}
				5095
				5096	if (args_owned) {
				5097	Py_DECREF(args);
				5098	}
				5099	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5100	if (_PyUnicode_Resize(result, reslen - rescnt))
				5101	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5102	return (PyObject *)result;
				5103
				5104	onError:
				5105	Py_XDECREF(result);
				5106	Py_DECREF(uformat);
				5107	if (args_owned) {
				5108	Py_DECREF(args);
				5109	}
				5110	return NULL;
				5111	}
				5112
				5113	static PyBufferProcs unicode_as_buffer = {
				5114	(getreadbufferproc) unicode_buffer_getreadbuf,
				5115	(getwritebufferproc) unicode_buffer_getwritebuf,
				5116	(getsegcountproc) unicode_buffer_getsegcount,
				5117	(getcharbufferproc) unicode_buffer_getcharbuf,
				5118	};
				5119
				5120	PyTypeObject PyUnicode_Type = {
				5121	PyObject_HEAD_INIT(&PyType_Type)
				5122	0, /* ob_size */
				5123	"unicode", /* tp_name */
				5124	sizeof(PyUnicodeObject), /* tp_size */
				5125	0, /* tp_itemsize */
				5126	/* Slots */
				5127	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5128	0, /* tp_print */
				5129	(getattrfunc)unicode_getattr, /* tp_getattr */
				5130	0, /* tp_setattr */
				5131	(cmpfunc) unicode_compare, /* tp_compare */
				5132	(reprfunc) unicode_repr, /* tp_repr */
				5133	0, /* tp_as_number */
				5134	&unicode_as_sequence, /* tp_as_sequence */
				5135	0, /* tp_as_mapping */
				5136	(hashfunc) unicode_hash, /* tp_hash*/
				5137	0, /* tp_call*/
				5138	(reprfunc) unicode_str, /* tp_str */
				5139	(getattrofunc) NULL, /* tp_getattro */
				5140	(setattrofunc) NULL, /* tp_setattro */
				5141	&unicode_as_buffer, /* tp_as_buffer */
				5142	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5143	};
				5144
				5145	/* Initialize the Unicode implementation */
				5146
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5147	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5148	{
				5149	/* Doublecheck the configuration... */
				5150	if (sizeof(Py_UNICODE) != 2)
				5151	Py_FatalError("Unicode configuration error: "
				5152	"sizeof(Py_UNICODE) != 2 bytes");
				5153
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5154	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5155	unicode_freelist = NULL;
				5156	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5157	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5158	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5159	}
				5160
				5161	/* Finalize the Unicode implementation */
				5162
				5163	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5164	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5165	{
				5166	PyUnicodeObject *u = unicode_freelist;
				5167
				5168	while (u != NULL) {
				5169	PyUnicodeObject *v = u;
				5170	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5171	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5172	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5173	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5174	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5175	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5176	unicode_freelist = NULL;
				5177	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5178	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5179	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5180	}