Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 11146ff0dd8e7bfeac7ecac731fbc9ff8dceff8e [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	111	/* --- Globals ------------------------------------------------------------
				112
				113	The globals are initialized by the _PyUnicode_Init() API and should
				114	not be used before calling that API.
				115
				116	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	117
				118	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	119	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	120
				121	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	122	static PyUnicodeObject *unicode_freelist;
				123	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	124
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	125	/* Default encoding to use and assume when NULL is passed as encoding
				126	parameter; it is initialized by _PyUnicode_Init().
				127
				128	Always use the PyUnicode_SetDefaultEncoding() and
				129	PyUnicode_GetDefaultEncoding() APIs to access this global.
				130
				131	*/
				132
				133	static char unicode_default_encoding[100];
				134
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* --- Unicode Object ----------------------------------------------------- */
				136
				137	static
				138	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				139	int length)
				140	{
				141	void *oldstr;
				142
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	143	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	144	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	146
				147	/* Resizing unicode_empty is not allowed. */
				148	if (unicode == unicode_empty) {
				149	PyErr_SetString(PyExc_SystemError,
				150	"can't resize empty unicode object");
				151	return -1;
				152	}
				153
				154	/* We allocate one more byte to make sure the string is
				155	Ux0000 terminated -- XXX is this needed ? */
				156	oldstr = unicode->str;
				157	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				158	if (!unicode->str) {
				159	unicode->str = oldstr;
				160	PyErr_NoMemory();
				161	return -1;
				162	}
				163	unicode->str[length] = 0;
				164	unicode->length = length;
				165
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	166	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	167	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	168	if (unicode->defenc) {
				169	Py_DECREF(unicode->defenc);
				170	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	171	}
				172	unicode->hash = -1;
				173
				174	return 0;
				175	}
				176
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	177	int PyUnicode_Resize(PyObject **unicode,
				178	int length)
				179	{
				180	PyUnicodeObject *v;
				181
				182	if (unicode == NULL) {
				183	PyErr_BadInternalCall();
				184	return -1;
				185	}
				186	v = (PyUnicodeObject )unicode;
				187	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				188	PyErr_BadInternalCall();
				189	return -1;
				190	}
				191	return _PyUnicode_Resize(v, length);
				192	}
				193
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	194	/* We allocate one more byte to make sure the string is
				195	Ux0000 terminated -- XXX is this needed ?
				196
				197	XXX This allocator could further be enhanced by assuring that the
				198	free list never reduces its size below 1.
				199
				200	*/
				201
				202	static
				203	PyUnicodeObject *_PyUnicode_New(int length)
				204	{
				205	register PyUnicodeObject *unicode;
				206
				207	/* Optimization for empty strings */
				208	if (length == 0 && unicode_empty != NULL) {
				209	Py_INCREF(unicode_empty);
				210	return unicode_empty;
				211	}
				212
				213	/* Unicode freelist & memory allocation */
				214	if (unicode_freelist) {
				215	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	216	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	217	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	219	/* Keep-Alive optimization: we only upsize the buffer,
				220	never downsize it. */
				221	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	223	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	224	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	227	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	229	}
				230	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	231	}
				232	else {
				233	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				234	if (unicode == NULL)
				235	return NULL;
				236	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				237	}
				238
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	239	if (!unicode->str) {
				240	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	241	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	242	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	243	unicode->str[length] = 0;
				244	unicode->length = length;
				245	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	246	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	248
				249	onError:
				250	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	251	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	252	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	253	}
				254
				255	static
				256	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				257	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	258	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	259	/* Keep-Alive optimization */
				260	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	261	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	262	unicode->str = NULL;
				263	unicode->length = 0;
				264	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	265	if (unicode->defenc) {
				266	Py_DECREF(unicode->defenc);
				267	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	268	}
				269	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	270	(PyUnicodeObject *)unicode = unicode_freelist;
				271	unicode_freelist = unicode;
				272	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	273	}
				274	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	275	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	276	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	277	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	278	}
				279	}
				280
				281	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				282	int size)
				283	{
				284	PyUnicodeObject *unicode;
				285
				286	unicode = _PyUnicode_New(size);
				287	if (!unicode)
				288	return NULL;
				289
				290	/* Copy the Unicode data into the new object */
				291	if (u != NULL)
				292	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	#ifdef HAVE_WCHAR_H
				298
				299	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				300	int size)
				301	{
				302	PyUnicodeObject *unicode;
				303
				304	if (w == NULL) {
				305	PyErr_BadInternalCall();
				306	return NULL;
				307	}
				308
				309	unicode = _PyUnicode_New(size);
				310	if (!unicode)
				311	return NULL;
				312
				313	/* Copy the wchar_t data into the new object */
				314	#ifdef HAVE_USABLE_WCHAR_T
				315	memcpy(unicode->str, w, size * sizeof(wchar_t));
				316	#else
				317	{
				318	register Py_UNICODE *u;
				319	register int i;
				320	u = PyUnicode_AS_UNICODE(unicode);
				321	for (i = size; i >= 0; i--)
				322	u++ = w++;
				323	}
				324	#endif
				325
				326	return (PyObject *)unicode;
				327	}
				328
				329	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				330	register wchar_t *w,
				331	int size)
				332	{
				333	if (unicode == NULL) {
				334	PyErr_BadInternalCall();
				335	return -1;
				336	}
				337	if (size > PyUnicode_GET_SIZE(unicode))
				338	size = PyUnicode_GET_SIZE(unicode);
				339	#ifdef HAVE_USABLE_WCHAR_T
				340	memcpy(w, unicode->str, size * sizeof(wchar_t));
				341	#else
				342	{
				343	register Py_UNICODE *u;
				344	register int i;
				345	u = PyUnicode_AS_UNICODE(unicode);
				346	for (i = size; i >= 0; i--)
				347	w++ = u++;
				348	}
				349	#endif
				350
				351	return size;
				352	}
				353
				354	#endif
				355
				356	PyObject PyUnicode_FromObject(register PyObject obj)
				357	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	358	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				359	}
				360
				361	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				362	const char *encoding,
				363	const char *errors)
				364	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	365	const char *s;
				366	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	367	int owned = 0;
				368	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	369
				370	if (obj == NULL) {
				371	PyErr_BadInternalCall();
				372	return NULL;
				373	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	374
				375	/* Coerce object */
				376	if (PyInstance_Check(obj)) {
				377	PyObject *func;
				378	func = PyObject_GetAttrString(obj, "__str__");
				379	if (func == NULL) {
				380	PyErr_SetString(PyExc_TypeError,
				381	"coercing to Unicode: instance doesn't define __str__");
				382	return NULL;
				383	}
				384	obj = PyEval_CallObject(func, NULL);
				385	Py_DECREF(func);
				386	if (obj == NULL)
				387	return NULL;
				388	owned = 1;
				389	}
				390	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	391	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	392	v = obj;
				393	if (encoding) {
				394	PyErr_SetString(PyExc_TypeError,
				395	"decoding Unicode is not supported");
				396	return NULL;
				397	}
				398	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	399	}
				400	else if (PyString_Check(obj)) {
				401	s = PyString_AS_STRING(obj);
				402	len = PyString_GET_SIZE(obj);
				403	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	404	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				405	/* Overwrite the error message with something more useful in
				406	case of a TypeError. */
				407	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	408	PyErr_Format(PyExc_TypeError,
				409	"coercing to Unicode: need string or buffer, "
				410	"%.80s found",
				411	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414
				415	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	if (len == 0) {
				417	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	418	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	419	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	420	else
				421	v = PyUnicode_Decode(s, len, encoding, errors);
				422	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	423	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	424	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426	return v;
				427
				428	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	429	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	430	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	431	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	432	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	433	}
				434
				435	PyObject PyUnicode_Decode(const char s,
				436	int size,
				437	const char *encoding,
				438	const char *errors)
				439	{
				440	PyObject buffer = NULL, unicode;
				441
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	442	if (encoding == NULL)
				443	encoding = PyUnicode_GetDefaultEncoding();
				444
				445	/* Shortcuts for common default encodings */
				446	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	447	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	448	else if (strcmp(encoding, "latin-1") == 0)
				449	return PyUnicode_DecodeLatin1(s, size, errors);
				450	else if (strcmp(encoding, "ascii") == 0)
				451	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452
				453	/* Decode via the codec registry */
				454	buffer = PyBuffer_FromMemory((void *)s, size);
				455	if (buffer == NULL)
				456	goto onError;
				457	unicode = PyCodec_Decode(buffer, encoding, errors);
				458	if (unicode == NULL)
				459	goto onError;
				460	if (!PyUnicode_Check(unicode)) {
				461	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	462	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	463	unicode->ob_type->tp_name);
				464	Py_DECREF(unicode);
				465	goto onError;
				466	}
				467	Py_DECREF(buffer);
				468	return unicode;
				469
				470	onError:
				471	Py_XDECREF(buffer);
				472	return NULL;
				473	}
				474
				475	PyObject PyUnicode_Encode(const Py_UNICODE s,
				476	int size,
				477	const char *encoding,
				478	const char *errors)
				479	{
				480	PyObject v, unicode;
				481
				482	unicode = PyUnicode_FromUnicode(s, size);
				483	if (unicode == NULL)
				484	return NULL;
				485	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				486	Py_DECREF(unicode);
				487	return v;
				488	}
				489
				490	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				491	const char *encoding,
				492	const char *errors)
				493	{
				494	PyObject *v;
				495
				496	if (!PyUnicode_Check(unicode)) {
				497	PyErr_BadArgument();
				498	goto onError;
				499	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	500
				501	if (encoding == NULL)
				502	encoding = PyUnicode_GetDefaultEncoding();
				503
				504	/* Shortcuts for common default encodings */
				505	if (errors == NULL) {
				506	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	507	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	508	else if (strcmp(encoding, "latin-1") == 0)
				509	return PyUnicode_AsLatin1String(unicode);
				510	else if (strcmp(encoding, "ascii") == 0)
				511	return PyUnicode_AsASCIIString(unicode);
				512	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	513
				514	/* Encode via the codec registry */
				515	v = PyCodec_Encode(unicode, encoding, errors);
				516	if (v == NULL)
				517	goto onError;
				518	/* XXX Should we really enforce this ? */
				519	if (!PyString_Check(v)) {
				520	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	521	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	522	v->ob_type->tp_name);
				523	Py_DECREF(v);
				524	goto onError;
				525	}
				526	return v;
				527
				528	onError:
				529	return NULL;
				530	}
				531
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	532	/* Return a Python string holding the default encoded value of the
				533	Unicode object.
				534
				535	The resulting string is cached in the Unicode object for subsequent
				536	usage by this function. The cached version is needed to implement
				537	the character buffer interface and will live (at least) as long as
				538	the Unicode object itself.
				539
				540	The refcount of the string is not incremented.
				541
				542	* Exported for internal use by the interpreter only !!! *
				543
				544	*/
				545
				546	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				547	const char *errors)
				548	{
				549	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				550
				551	if (v)
				552	return v;
				553	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				554	if (v && errors == NULL)
				555	((PyUnicodeObject *)unicode)->defenc = v;
				556	return v;
				557	}
				558
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				560	{
				561	if (!PyUnicode_Check(unicode)) {
				562	PyErr_BadArgument();
				563	goto onError;
				564	}
				565	return PyUnicode_AS_UNICODE(unicode);
				566
				567	onError:
				568	return NULL;
				569	}
				570
				571	int PyUnicode_GetSize(PyObject *unicode)
				572	{
				573	if (!PyUnicode_Check(unicode)) {
				574	PyErr_BadArgument();
				575	goto onError;
				576	}
				577	return PyUnicode_GET_SIZE(unicode);
				578
				579	onError:
				580	return -1;
				581	}
				582
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	583	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	584	{
				585	return unicode_default_encoding;
				586	}
				587
				588	int PyUnicode_SetDefaultEncoding(const char *encoding)
				589	{
				590	PyObject *v;
				591
				592	/* Make sure the encoding is valid. As side effect, this also
				593	loads the encoding into the codec registry cache. */
				594	v = _PyCodec_Lookup(encoding);
				595	if (v == NULL)
				596	goto onError;
				597	Py_DECREF(v);
				598	strncpy(unicode_default_encoding,
				599	encoding,
				600	sizeof(unicode_default_encoding));
				601	return 0;
				602
				603	onError:
				604	return -1;
				605	}
				606
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	607	/* --- UTF-8 Codec -------------------------------------------------------- */
				608
				609	static
				610	char utf8_code_length[256] = {
				611	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				612	illegal prefix. see RFC 2279 for details */
				613	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				614	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				615	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				616	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				617	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				618	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				619	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				620	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				621	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				622	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				623	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				624	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				625	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				626	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				627	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				628	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				629	};
				630
				631	static
				632	int utf8_decoding_error(const char **source,
				633	Py_UNICODE **dest,
				634	const char *errors,
				635	const char *details)
				636	{
				637	if ((errors == NULL) \|\|
				638	(strcmp(errors,"strict") == 0)) {
				639	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	640	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	641	details);
				642	return -1;
				643	}
				644	else if (strcmp(errors,"ignore") == 0) {
				645	(*source)++;
				646	return 0;
				647	}
				648	else if (strcmp(errors,"replace") == 0) {
				649	(*source)++;
				650	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				651	(*dest)++;
				652	return 0;
				653	}
				654	else {
				655	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	656	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	657	errors);
				658	return -1;
				659	}
				660	}
				661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	662	PyObject PyUnicode_DecodeUTF8(const char s,
				663	int size,
				664	const char *errors)
				665	{
				666	int n;
				667	const char *e;
				668	PyUnicodeObject *unicode;
				669	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	670	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	671
				672	/* Note: size will always be longer than the resulting Unicode
				673	character count */
				674	unicode = _PyUnicode_New(size);
				675	if (!unicode)
				676	return NULL;
				677	if (size == 0)
				678	return (PyObject *)unicode;
				679
				680	/* Unpack UTF-8 encoded data */
				681	p = unicode->str;
				682	e = s + size;
				683
				684	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	685	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686
				687	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	688	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	689	s++;
				690	continue;
				691	}
				692
				693	n = utf8_code_length[ch];
				694
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	695	if (s + n > e) {
				696	errmsg = "unexpected end of data";
				697	goto utf8Error;
				698	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699
				700	switch (n) {
				701
				702	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	703	errmsg = "unexpected code byte";
				704	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	705	break;
				706
				707	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	708	errmsg = "internal error";
				709	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	710	break;
				711
				712	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	if ((s[1] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	717	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	718	if (ch < 0x80) {
				719	errmsg = "illegal encoding";
				720	goto utf8Error;
				721	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	722	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	723	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	724	break;
				725
				726	case 3:
				727	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	728	(s[2] & 0xc0) != 0x80) {
				729	errmsg = "invalid data";
				730	goto utf8Error;
				731	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	732	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	733	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				734	errmsg = "illegal encoding";
				735	goto utf8Error;
				736	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	738	*p++ = (Py_UNICODE)ch;
				739	break;
				740
				741	case 4:
				742	if ((s[1] & 0xc0) != 0x80 \|\|
				743	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	(s[3] & 0xc0) != 0x80) {
				745	errmsg = "invalid data";
				746	goto utf8Error;
				747	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	748	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				749	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				750	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	751	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				752	byte encoding */
				753	(ch > 0x10ffff)) { /* maximum value allowed for
				754	UTF-16 */
				755	errmsg = "illegal encoding";
				756	goto utf8Error;
				757	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	758	/* compute and append the two surrogates: */
				759
				760	/* translate from 10000..10FFFF to 0..FFFF */
				761	ch -= 0x10000;
				762
				763	/* high surrogate = top 10 bits added to D800 */
				764	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				765
				766	/* low surrogate = bottom 10 bits added to DC00 */
				767	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	768	break;
				769
				770	default:
				771	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	772	errmsg = "unsupported Unicode code range";
				773	goto utf8Error;
				774	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	775	}
				776	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	777	continue;
				778
				779	utf8Error:
				780	if (utf8_decoding_error(&s, &p, errors, errmsg))
				781	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	782	}
				783
				784	/* Adjust length */
				785	if (_PyUnicode_Resize(unicode, p - unicode->str))
				786	goto onError;
				787
				788	return (PyObject *)unicode;
				789
				790	onError:
				791	Py_DECREF(unicode);
				792	return NULL;
				793	}
				794
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the caller's `errors` policy to a UTF-8 encoding problem.

   errors == NULL or "strict": set UnicodeError (with `details`), return -1.
   "ignore":  do nothing, return 0.
   "replace": store '?' at **dest, advance *dest by one, return 0.
   anything else: set ValueError, return -1.

   `source` is accepted but not touched. */
static int
utf8_encoding_error(const Py_UNICODE **source,
                    char **dest,
                    const char *errors,
                    const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	828
				829	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				830	int size,
				831	const char *errors)
				832	{
				833	PyObject *v;
				834	char *p;
				835	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	836	Py_UCS4 ch2;
				837	unsigned int cbAllocated = 3 * size;
				838	unsigned int cbWritten = 0;
				839	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	842	if (v == NULL)
				843	return NULL;
				844	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	845	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	846
				847	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	while (i < size) {
				849	Py_UCS4 ch = s[i++];
				850	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	851	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	852	cbWritten++;
				853	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	854	else if (ch < 0x0800) {
				855	*p++ = 0xc0 \| (ch >> 6);
				856	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	857	cbWritten += 2;
				858	}
				859	else {
				860	/* Check for high surrogate */
				861	if (0xD800 <= ch && ch <= 0xDBFF) {
				862	if (i != size) {
				863	ch2 = s[i];
				864	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				865
				866	if (cbWritten >= (cbAllocated - 4)) {
				867	/* Provide enough room for some more
				868	surrogates */
				869	cbAllocated += 4*10;
				870	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	871	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	872	}
				873
				874	/* combine the two values */
				875	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				876
				877	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	878	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	879	i++;
				880	cbWritten += 4;
				881	}
				882	}
				883	}
				884	else {
				885	*p++ = (char)(0xe0 \| (ch >> 12));
				886	cbWritten += 3;
				887	}
				888	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				889	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	}
				891	}
				892	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	893	if (_PyString_Resize(&v, p - q))
				894	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	895	return v;
				896
				897	onError:
				898	Py_DECREF(v);
				899	return NULL;
				900	}
				901
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	902	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				903	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	904	if (!PyUnicode_Check(unicode)) {
				905	PyErr_BadArgument();
				906	return NULL;
				907	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	908	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				909	PyUnicode_GET_SIZE(unicode),
				910	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	911	}
				912
				913	/* --- UTF-16 Codec ------------------------------------------------------- */
				914
				915	static
				916	int utf16_decoding_error(const Py_UNICODE **source,
				917	Py_UNICODE **dest,
				918	const char *errors,
				919	const char *details)
				920	{
				921	if ((errors == NULL) \|\|
				922	(strcmp(errors,"strict") == 0)) {
				923	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	924	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	925	details);
				926	return -1;
				927	}
				928	else if (strcmp(errors,"ignore") == 0) {
				929	return 0;
				930	}
				931	else if (strcmp(errors,"replace") == 0) {
				932	if (dest) {
				933	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				934	(*dest)++;
				935	}
				936	return 0;
				937	}
				938	else {
				939	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	940	"UTF-16 decoding error; "
				941	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	942	errors);
				943	return -1;
				944	}
				945	}
				946
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	947	PyObject PyUnicode_DecodeUTF16(const char s,
				948	int size,
				949	const char *errors,
				950	int *byteorder)
				951	{
				952	PyUnicodeObject *unicode;
				953	Py_UNICODE *p;
				954	const Py_UNICODE q, e;
				955	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	956	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	957
				958	/* size should be an even number */
				959	if (size % sizeof(Py_UNICODE) != 0) {
				960	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				961	return NULL;
				962	/* The remaining input chars are ignored if we fall through
				963	here... */
				964	}
				965
				966	/* Note: size will always be longer than the resulting Unicode
				967	character count */
				968	unicode = _PyUnicode_New(size);
				969	if (!unicode)
				970	return NULL;
				971	if (size == 0)
				972	return (PyObject *)unicode;
				973
				974	/* Unpack UTF-16 encoded data */
				975	p = unicode->str;
				976	q = (Py_UNICODE *)s;
				977	e = q + (size / sizeof(Py_UNICODE));
				978
				979	if (byteorder)
				980	bo = *byteorder;
				981
				982	while (q < e) {
				983	register Py_UNICODE ch = *q++;
				984
				985	/* Check for BOM marks (U+FEFF) in the input and adjust
				986	current byte order setting accordingly. Swap input
				987	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				988	!) */
				989	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				990	if (ch == 0xFEFF) {
				991	bo = -1;
				992	continue;
				993	} else if (ch == 0xFFFE) {
				994	bo = 1;
				995	continue;
				996	}
				997	if (bo == 1)
				998	ch = (ch >> 8) \| (ch << 8);
				999	#else
				1000	if (ch == 0xFEFF) {
				1001	bo = 1;
				1002	continue;
				1003	} else if (ch == 0xFFFE) {
				1004	bo = -1;
				1005	continue;
				1006	}
				1007	if (bo == -1)
				1008	ch = (ch >> 8) \| (ch << 8);
				1009	#endif
				1010	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1011	*p++ = ch;
				1012	continue;
				1013	}
				1014
				1015	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1016	if (q >= e) {
				1017	errmsg = "unexpected end of data";
				1018	goto utf16Error;
				1019	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1020	if (0xDC00 <= q && q <= 0xDFFF) {
				1021	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1022	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023	/* This is valid data (a UTF-16 surrogate pair), but
				1024	we are not able to store this information since our
				1025	Py_UNICODE type only has 16 bits... this might
				1026	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1027	errmsg = "code pairs are not supported";
				1028	goto utf16Error;
				1029	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1030	else
				1031	continue;
				1032	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1033	errmsg = "illegal encoding";
				1034	/* Fall through to report the error */
				1035
				1036	utf16Error:
				1037	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1038	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1039	}
				1040
				1041	if (byteorder)
				1042	*byteorder = bo;
				1043
				1044	/* Adjust length */
				1045	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1046	goto onError;
				1047
				1048	return (PyObject *)unicode;
				1049
				1050	onError:
				1051	Py_DECREF(unicode);
				1052	return NULL;
				1053	}
				1054
				1055	#undef UTF16_ERROR
				1056
				1057	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1058	int size,
				1059	const char *errors,
				1060	int byteorder)
				1061	{
				1062	PyObject *v;
				1063	Py_UNICODE *p;
				1064	char *q;
				1065
				1066	/* We don't create UTF-16 pairs... */
				1067	v = PyString_FromStringAndSize(NULL,
				1068	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1069	if (v == NULL)
				1070	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1071
				1072	q = PyString_AS_STRING(v);
				1073	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1074	if (byteorder == 0)
				1075	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1076	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1077	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1078	if (byteorder == 0 \|\|
				1079	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1080	byteorder == -1
				1081	#else
				1082	byteorder == 1
				1083	#endif
				1084	)
				1085	memcpy(p, s, size * sizeof(Py_UNICODE));
				1086	else
				1087	while (size-- > 0) {
				1088	Py_UNICODE ch = *s++;
				1089	*p++ = (ch >> 8) \| (ch << 8);
				1090	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1091	return v;
				1092	}
				1093
				1094	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1095	{
				1096	if (!PyUnicode_Check(unicode)) {
				1097	PyErr_BadArgument();
				1098	return NULL;
				1099	}
				1100	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1101	PyUnicode_GET_SIZE(unicode),
				1102	NULL,
				1103	0);
				1104	}
				1105
				1106	/* --- Unicode Escape Codec ----------------------------------------------- */
				1107
				1108	static
				1109	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1110	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1111	const char *errors,
				1112	const char *details)
				1113	{
				1114	if ((errors == NULL) \|\|
				1115	(strcmp(errors,"strict") == 0)) {
				1116	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1117	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1118	details);
				1119	return -1;
				1120	}
				1121	else if (strcmp(errors,"ignore") == 0) {
				1122	return 0;
				1123	}
				1124	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1125	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1126	return 0;
				1127	}
				1128	else {
				1129	PyErr_Format(PyExc_ValueError,
				1130	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1131	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1132	errors);
				1133	return -1;
				1134	}
				1135	}
				1136
/* Pointer to the Unicode character name lookup API used for \N{...}
   escapes.  It starts out NULL; PyUnicode_DecodeUnicodeEscape() fills it
   in from the "ucnhashAPI" attribute of the "ucnhash" module the first
   time it meets a \N escape. */
static _Py_UCNHashAPI *pucnHash = NULL;
				1138
				1139	static
				1140	int mystrnicmp(const char s1, const char s2, size_t count)
				1141	{
				1142	char c1, c2;
				1143
				1144	if (count)
				1145	{
				1146	do
				1147	{
				1148	c1 = tolower(*(s1++));
				1149	c2 = tolower(*(s2++));
				1150	}
				1151	while(--count && c1 == c2);
				1152
				1153	return c1 - c2;
				1154	}
				1155
				1156	return 0;
				1157	}
				1158
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1160	int size,
				1161	const char *errors)
				1162	{
				1163	PyUnicodeObject *v;
				1164	Py_UNICODE p = NULL, buf = NULL;
				1165	const char *end;
				1166
				1167	/* Escaped strings will always be longer than the resulting
				1168	Unicode string, so we start with size here and then reduce the
				1169	length after conversion to the true value. */
				1170	v = _PyUnicode_New(size);
				1171	if (v == NULL)
				1172	goto onError;
				1173	if (size == 0)
				1174	return (PyObject *)v;
				1175	p = buf = PyUnicode_AS_UNICODE(v);
				1176	end = s + size;
				1177	while (s < end) {
				1178	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1179	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1180	int i;
				1181
				1182	/* Non-escape characters are interpreted as Unicode ordinals */
				1183	if (*s != '\\') {
				1184	p++ = (unsigned char)s++;
				1185	continue;
				1186	}
				1187
				1188	/* \ - Escapes */
				1189	s++;
				1190	switch (*s++) {
				1191
				1192	/* \x escapes */
				1193	case '\n': break;
				1194	case '\\': *p++ = '\\'; break;
				1195	case '\'': *p++ = '\''; break;
				1196	case '\"': *p++ = '\"'; break;
				1197	case 'b': *p++ = '\b'; break;
				1198	case 'f': p++ = '\014'; break; / FF */
				1199	case 't': *p++ = '\t'; break;
				1200	case 'n': *p++ = '\n'; break;
				1201	case 'r': *p++ = '\r'; break;
				1202	case 'v': p++ = '\013'; break; / VT */
				1203	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1204
				1205	/* \OOO (octal) escapes */
				1206	case '0': case '1': case '2': case '3':
				1207	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1208	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1209	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1210	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1211	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1212	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1213	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1214	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1215	break;
				1216
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1217	/* \xXXXX escape with 1-n hex digits. for compatibility
				1218	with 8-bit strings, this code ignores all but the last
				1219	two digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1220	case 'x':
				1221	x = 0;
				1222	c = (unsigned char)*s;
				1223	if (isxdigit(c)) {
				1224	do {
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1225	x = (x<<4) & 0xF0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1226	if ('0' <= c && c <= '9')
				1227	x += c - '0';
				1228	else if ('a' <= c && c <= 'f')
				1229	x += 10 + c - 'a';
				1230	else
				1231	x += 10 + c - 'A';
				1232	c = (unsigned char)*++s;
				1233	} while (isxdigit(c));
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1234	*p++ = (unsigned char) x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1235	} else {
				1236	*p++ = '\\';
				1237	*p++ = (unsigned char)s[-1];
				1238	}
				1239	break;
				1240
				1241	/* \uXXXX with 4 hex digits */
				1242	case 'u':
				1243	for (x = 0, i = 0; i < 4; i++) {
				1244	c = (unsigned char)s[i];
				1245	if (!isxdigit(c)) {
				1246	if (unicodeescape_decoding_error(&s, &x, errors,
				1247	"truncated \\uXXXX"))
				1248	goto onError;
				1249	i++;
				1250	break;
				1251	}
				1252	x = (x<<4) & ~0xF;
				1253	if (c >= '0' && c <= '9')
				1254	x += c - '0';
				1255	else if (c >= 'a' && c <= 'f')
				1256	x += 10 + c - 'a';
				1257	else
				1258	x += 10 + c - 'A';
				1259	}
				1260	s += i;
				1261	*p++ = x;
				1262	break;
				1263
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1264	case 'N':
				1265	/* Ok, we need to deal with Unicode Character Names now,
				1266	* make sure we've imported the hash table data...
				1267	*/
				1268	if (pucnHash == NULL)
				1269	{
				1270	PyObject mod = 0, v = 0;
				1271
				1272	mod = PyImport_ImportModule("ucnhash");
				1273	if (mod == NULL)
				1274	goto onError;
				1275	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1276	Py_DECREF(mod);
				1277	if (v == NULL)
				1278	{
				1279	goto onError;
				1280	}
				1281	pucnHash = PyCObject_AsVoidPtr(v);
				1282	Py_DECREF(v);
				1283	if (pucnHash == NULL)
				1284	{
				1285	goto onError;
				1286	}
				1287	}
				1288
				1289	if (*s == '{')
				1290	{
				1291	const char *start = s + 1;
				1292	const char *endBrace = start;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1293	Py_UCS4 value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1294	unsigned long j;
				1295
				1296	/* look for either the closing brace, or we
				1297	* exceed the maximum length of the unicode character names
				1298	*/
				1299	while (*endBrace != '}' &&
				1300	(unsigned int)(endBrace - start) <=
				1301	pucnHash->cchMax &&
				1302	endBrace < end)
				1303	{
				1304	endBrace++;
				1305	}
				1306	if (endBrace != end && *endBrace == '}')
				1307	{
				1308	j = pucnHash->hash(start, endBrace - start);
				1309	if (j > pucnHash->cKeys \|\|
				1310	mystrnicmp(
				1311	start,
				1312	((_Py_UnicodeCharacterName *)
				1313	(pucnHash->getValue(j)))->pszUCN,
				1314	(int)(endBrace - start)) != 0)
				1315	{
				1316	if (unicodeescape_decoding_error(
				1317	&s, &x, errors,
				1318	"Invalid Unicode Character Name"))
				1319	{
				1320	goto onError;
				1321	}
				1322	goto ucnFallthrough;
				1323	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1324	value = ((_Py_UnicodeCharacterName *)
				1325	(pucnHash->getValue(j)))->value;
				1326	if (value < 1<<16)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1327	{
				1328	/* In UCS-2 range, easy solution.. */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1329	*p++ = value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1330	}
				1331	else
				1332	{
				1333	/* Oops, its in UCS-4 space, */
				1334	/* compute and append the two surrogates: */
				1335	/* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1336	value -= 0x10000;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1337
				1338	/* high surrogate = top 10 bits added to D800 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1339	*p++ = 0xD800 + (value >> 10);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1340
				1341	/* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1342	*p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1343	}
				1344	s = endBrace + 1;
				1345	}
				1346	else
				1347	{
				1348	if (unicodeescape_decoding_error(
				1349	&s, &x, errors,
				1350	"Unicode name missing closing brace"))
				1351	goto onError;
				1352	goto ucnFallthrough;
				1353	}
				1354	break;
				1355	}
				1356	if (unicodeescape_decoding_error(
				1357	&s, &x, errors,
				1358	"Missing opening brace for Unicode Character Name escape"))
				1359	goto onError;
				1360	ucnFallthrough:
				1361	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1362	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1363	*p++ = '\\';
				1364	*p++ = (unsigned char)s[-1];
				1365	break;
				1366	}
				1367	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1368	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1369	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1370	return (PyObject *)v;
				1371
				1372	onError:
				1373	Py_XDECREF(v);
				1374	return NULL;
				1375	}
				1376
				1377	/* Return a Unicode-Escape string version of the Unicode object.
				1378
				1379	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1380	appropriate.
				1381
				1382	*/
				1383
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1384	static const Py_UNICODE findchar(const Py_UNICODE s,
				1385	int size,
				1386	Py_UNICODE ch);
				1387
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1388	static
				1389	PyObject unicodeescape_string(const Py_UNICODE s,
				1390	int size,
				1391	int quotes)
				1392	{
				1393	PyObject *repr;
				1394	char *p;
				1395	char *q;
				1396
				1397	static const char *hexdigit = "0123456789ABCDEF";
				1398
				1399	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1400	if (repr == NULL)
				1401	return NULL;
				1402
				1403	p = q = PyString_AS_STRING(repr);
				1404
				1405	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1406	*p++ = 'u';
				1407	*p++ = (findchar(s, size, '\'') &&
				1408	!findchar(s, size, '"')) ? '"' : '\'';
				1409	}
				1410	while (size-- > 0) {
				1411	Py_UNICODE ch = *s++;
				1412	/* Escape quotes */
				1413	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1414	*p++ = '\\';
				1415	*p++ = (char) ch;
				1416	}
				1417	/* Map 16-bit characters to '\uxxxx' */
				1418	else if (ch >= 256) {
				1419	*p++ = '\\';
				1420	*p++ = 'u';
				1421	*p++ = hexdigit[(ch >> 12) & 0xf];
				1422	*p++ = hexdigit[(ch >> 8) & 0xf];
				1423	*p++ = hexdigit[(ch >> 4) & 0xf];
				1424	*p++ = hexdigit[ch & 15];
				1425	}
				1426	/* Map non-printable US ASCII to '\ooo' */
				1427	else if (ch < ' ' \|\| ch >= 128) {
				1428	*p++ = '\\';
				1429	*p++ = hexdigit[(ch >> 6) & 7];
				1430	*p++ = hexdigit[(ch >> 3) & 7];
				1431	*p++ = hexdigit[ch & 7];
				1432	}
				1433	/* Copy everything else as-is */
				1434	else
				1435	*p++ = (char) ch;
				1436	}
				1437	if (quotes)
				1438	*p++ = q[1];
				1439
				1440	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1441	if (_PyString_Resize(&repr, p - q))
				1442	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1443
				1444	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1445
				1446	onError:
				1447	Py_DECREF(repr);
				1448	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1449	}
				1450
				1451	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1452	int size)
				1453	{
				1454	return unicodeescape_string(s, size, 0);
				1455	}
				1456
				1457	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1458	{
				1459	if (!PyUnicode_Check(unicode)) {
				1460	PyErr_BadArgument();
				1461	return NULL;
				1462	}
				1463	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1464	PyUnicode_GET_SIZE(unicode));
				1465	}
				1466
				1467	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1468
				1469	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1470	int size,
				1471	const char *errors)
				1472	{
				1473	PyUnicodeObject *v;
				1474	Py_UNICODE p, buf;
				1475	const char *end;
				1476	const char *bs;
				1477
				1478	/* Escaped strings will always be longer than the resulting
				1479	Unicode string, so we start with size here and then reduce the
				1480	length after conversion to the true value. */
				1481	v = _PyUnicode_New(size);
				1482	if (v == NULL)
				1483	goto onError;
				1484	if (size == 0)
				1485	return (PyObject *)v;
				1486	p = buf = PyUnicode_AS_UNICODE(v);
				1487	end = s + size;
				1488	while (s < end) {
				1489	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1490	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1491	int i;
				1492
				1493	/* Non-escape characters are interpreted as Unicode ordinals */
				1494	if (*s != '\\') {
				1495	p++ = (unsigned char)s++;
				1496	continue;
				1497	}
				1498
				1499	/* \u-escapes are only interpreted iff the number of leading
				1500	backslashes if odd */
				1501	bs = s;
				1502	for (;s < end;) {
				1503	if (*s != '\\')
				1504	break;
				1505	p++ = (unsigned char)s++;
				1506	}
				1507	if (((s - bs) & 1) == 0 \|\|
				1508	s >= end \|\|
				1509	*s != 'u') {
				1510	continue;
				1511	}
				1512	p--;
				1513	s++;
				1514
				1515	/* \uXXXX with 4 hex digits */
				1516	for (x = 0, i = 0; i < 4; i++) {
				1517	c = (unsigned char)s[i];
				1518	if (!isxdigit(c)) {
				1519	if (unicodeescape_decoding_error(&s, &x, errors,
				1520	"truncated \\uXXXX"))
				1521	goto onError;
				1522	i++;
				1523	break;
				1524	}
				1525	x = (x<<4) & ~0xF;
				1526	if (c >= '0' && c <= '9')
				1527	x += c - '0';
				1528	else if (c >= 'a' && c <= 'f')
				1529	x += 10 + c - 'a';
				1530	else
				1531	x += 10 + c - 'A';
				1532	}
				1533	s += i;
				1534	*p++ = x;
				1535	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1536	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1537	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1538	return (PyObject *)v;
				1539
				1540	onError:
				1541	Py_XDECREF(v);
				1542	return NULL;
				1543	}
				1544
				1545	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1546	int size)
				1547	{
				1548	PyObject *repr;
				1549	char *p;
				1550	char *q;
				1551
				1552	static const char *hexdigit = "0123456789ABCDEF";
				1553
				1554	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1555	if (repr == NULL)
				1556	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1557	if (size == 0)
				1558	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1559
				1560	p = q = PyString_AS_STRING(repr);
				1561	while (size-- > 0) {
				1562	Py_UNICODE ch = *s++;
				1563	/* Map 16-bit characters to '\uxxxx' */
				1564	if (ch >= 256) {
				1565	*p++ = '\\';
				1566	*p++ = 'u';
				1567	*p++ = hexdigit[(ch >> 12) & 0xf];
				1568	*p++ = hexdigit[(ch >> 8) & 0xf];
				1569	*p++ = hexdigit[(ch >> 4) & 0xf];
				1570	*p++ = hexdigit[ch & 15];
				1571	}
				1572	/* Copy everything else as-is */
				1573	else
				1574	*p++ = (char) ch;
				1575	}
				1576	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1577	if (_PyString_Resize(&repr, p - q))
				1578	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1579
				1580	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1581
				1582	onError:
				1583	Py_DECREF(repr);
				1584	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1585	}
				1586
				1587	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1588	{
				1589	if (!PyUnicode_Check(unicode)) {
				1590	PyErr_BadArgument();
				1591	return NULL;
				1592	}
				1593	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1594	PyUnicode_GET_SIZE(unicode));
				1595	}
				1596
				1597	/* --- Latin-1 Codec ------------------------------------------------------ */
				1598
				1599	PyObject PyUnicode_DecodeLatin1(const char s,
				1600	int size,
				1601	const char *errors)
				1602	{
				1603	PyUnicodeObject *v;
				1604	Py_UNICODE *p;
				1605
				1606	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1607	v = _PyUnicode_New(size);
				1608	if (v == NULL)
				1609	goto onError;
				1610	if (size == 0)
				1611	return (PyObject *)v;
				1612	p = PyUnicode_AS_UNICODE(v);
				1613	while (size-- > 0)
				1614	p++ = (unsigned char)s++;
				1615	return (PyObject *)v;
				1616
				1617	onError:
				1618	Py_XDECREF(v);
				1619	return NULL;
				1620	}
				1621
				1622	static
				1623	int latin1_encoding_error(const Py_UNICODE **source,
				1624	char **dest,
				1625	const char *errors,
				1626	const char *details)
				1627	{
				1628	if ((errors == NULL) \|\|
				1629	(strcmp(errors,"strict") == 0)) {
				1630	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1631	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1632	details);
				1633	return -1;
				1634	}
				1635	else if (strcmp(errors,"ignore") == 0) {
				1636	return 0;
				1637	}
				1638	else if (strcmp(errors,"replace") == 0) {
				1639	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1640	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1641	return 0;
				1642	}
				1643	else {
				1644	PyErr_Format(PyExc_ValueError,
				1645	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1646	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1647	errors);
				1648	return -1;
				1649	}
				1650	}
				1651
				1652	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1653	int size,
				1654	const char *errors)
				1655	{
				1656	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1657	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1658
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1659	repr = PyString_FromStringAndSize(NULL, size);
				1660	if (repr == NULL)
				1661	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1662	if (size == 0)
				1663	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1664
				1665	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1666	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1667	while (size-- > 0) {
				1668	Py_UNICODE ch = *p++;
				1669	if (ch >= 256) {
				1670	if (latin1_encoding_error(&p, &s, errors,
				1671	"ordinal not in range(256)"))
				1672	goto onError;
				1673	}
				1674	else
				1675	*s++ = (char)ch;
				1676	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1677	/* Resize if error handling skipped some characters */
				1678	if (s - start < PyString_GET_SIZE(repr))
				1679	if (_PyString_Resize(&repr, s - start))
				1680	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1681	return repr;
				1682
				1683	onError:
				1684	Py_DECREF(repr);
				1685	return NULL;
				1686	}
				1687
				1688	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1689	{
				1690	if (!PyUnicode_Check(unicode)) {
				1691	PyErr_BadArgument();
				1692	return NULL;
				1693	}
				1694	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1695	PyUnicode_GET_SIZE(unicode),
				1696	NULL);
				1697	}
				1698
				1699	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1700
				1701	static
				1702	int ascii_decoding_error(const char **source,
				1703	Py_UNICODE **dest,
				1704	const char *errors,
				1705	const char *details)
				1706	{
				1707	if ((errors == NULL) \|\|
				1708	(strcmp(errors,"strict") == 0)) {
				1709	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1710	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1711	details);
				1712	return -1;
				1713	}
				1714	else if (strcmp(errors,"ignore") == 0) {
				1715	return 0;
				1716	}
				1717	else if (strcmp(errors,"replace") == 0) {
				1718	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1719	(*dest)++;
				1720	return 0;
				1721	}
				1722	else {
				1723	PyErr_Format(PyExc_ValueError,
				1724	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1725	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1726	errors);
				1727	return -1;
				1728	}
				1729	}
				1730
				1731	PyObject PyUnicode_DecodeASCII(const char s,
				1732	int size,
				1733	const char *errors)
				1734	{
				1735	PyUnicodeObject *v;
				1736	Py_UNICODE *p;
				1737
				1738	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1739	v = _PyUnicode_New(size);
				1740	if (v == NULL)
				1741	goto onError;
				1742	if (size == 0)
				1743	return (PyObject *)v;
				1744	p = PyUnicode_AS_UNICODE(v);
				1745	while (size-- > 0) {
				1746	register unsigned char c;
				1747
				1748	c = (unsigned char)*s++;
				1749	if (c < 128)
				1750	*p++ = c;
				1751	else if (ascii_decoding_error(&s, &p, errors,
				1752	"ordinal not in range(128)"))
				1753	goto onError;
				1754	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1755	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1756	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1757	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1758	return (PyObject *)v;
				1759
				1760	onError:
				1761	Py_XDECREF(v);
				1762	return NULL;
				1763	}
				1764
				1765	static
				1766	int ascii_encoding_error(const Py_UNICODE **source,
				1767	char **dest,
				1768	const char *errors,
				1769	const char *details)
				1770	{
				1771	if ((errors == NULL) \|\|
				1772	(strcmp(errors,"strict") == 0)) {
				1773	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1774	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1775	details);
				1776	return -1;
				1777	}
				1778	else if (strcmp(errors,"ignore") == 0) {
				1779	return 0;
				1780	}
				1781	else if (strcmp(errors,"replace") == 0) {
				1782	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1783	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1784	return 0;
				1785	}
				1786	else {
				1787	PyErr_Format(PyExc_ValueError,
				1788	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1789	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1790	errors);
				1791	return -1;
				1792	}
				1793	}
				1794
				1795	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1796	int size,
				1797	const char *errors)
				1798	{
				1799	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1800	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1801
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1802	repr = PyString_FromStringAndSize(NULL, size);
				1803	if (repr == NULL)
				1804	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1805	if (size == 0)
				1806	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1807
				1808	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1809	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1810	while (size-- > 0) {
				1811	Py_UNICODE ch = *p++;
				1812	if (ch >= 128) {
				1813	if (ascii_encoding_error(&p, &s, errors,
				1814	"ordinal not in range(128)"))
				1815	goto onError;
				1816	}
				1817	else
				1818	*s++ = (char)ch;
				1819	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1820	/* Resize if error handling skipped some characters */
				1821	if (s - start < PyString_GET_SIZE(repr))
				1822	if (_PyString_Resize(&repr, s - start))
				1823	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1824	return repr;
				1825
				1826	onError:
				1827	Py_DECREF(repr);
				1828	return NULL;
				1829	}
				1830
				1831	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1832	{
				1833	if (!PyUnicode_Check(unicode)) {
				1834	PyErr_BadArgument();
				1835	return NULL;
				1836	}
				1837	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1838	PyUnicode_GET_SIZE(unicode),
				1839	NULL);
				1840	}
				1841
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1842	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1843
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1844	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1845
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1846	PyObject PyUnicode_DecodeMBCS(const char s,
				1847	int size,
				1848	const char *errors)
				1849	{
				1850	PyUnicodeObject *v;
				1851	Py_UNICODE *p;
				1852
				1853	/* First get the size of the result */
				1854	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1855	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1856	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1857
				1858	v = _PyUnicode_New(usize);
				1859	if (v == NULL)
				1860	return NULL;
				1861	if (usize == 0)
				1862	return (PyObject *)v;
				1863	p = PyUnicode_AS_UNICODE(v);
				1864	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1865	Py_DECREF(v);
				1866	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1867	}
				1868
				1869	return (PyObject *)v;
				1870	}
				1871
				1872	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1873	int size,
				1874	const char *errors)
				1875	{
				1876	PyObject *repr;
				1877	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1878	DWORD mbcssize;
				1879
				1880	/* If there are no characters, bail now! */
				1881	if (size==0)
				1882	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1883
				1884	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1885	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1886	if (mbcssize==0)
				1887	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1888
				1889	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1890	if (repr == NULL)
				1891	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1892	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1893	return repr;
				1894
				1895	/* Do the conversion */
				1896	s = PyString_AS_STRING(repr);
				1897	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1898	Py_DECREF(repr);
				1899	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1900	}
				1901	return repr;
				1902	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1903
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1904	#endif /* MS_WIN32 */
				1905
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1906	/* --- Character Mapping Codec -------------------------------------------- */
				1907
				1908	static
				1909	int charmap_decoding_error(const char **source,
				1910	Py_UNICODE **dest,
				1911	const char *errors,
				1912	const char *details)
				1913	{
				1914	if ((errors == NULL) \|\|
				1915	(strcmp(errors,"strict") == 0)) {
				1916	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1917	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1918	details);
				1919	return -1;
				1920	}
				1921	else if (strcmp(errors,"ignore") == 0) {
				1922	return 0;
				1923	}
				1924	else if (strcmp(errors,"replace") == 0) {
				1925	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1926	(*dest)++;
				1927	return 0;
				1928	}
				1929	else {
				1930	PyErr_Format(PyExc_ValueError,
				1931	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1932	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1933	errors);
				1934	return -1;
				1935	}
				1936	}
				1937
				1938	PyObject PyUnicode_DecodeCharmap(const char s,
				1939	int size,
				1940	PyObject *mapping,
				1941	const char *errors)
				1942	{
				1943	PyUnicodeObject *v;
				1944	Py_UNICODE *p;
				1945
				1946	/* Default to Latin-1 */
				1947	if (mapping == NULL)
				1948	return PyUnicode_DecodeLatin1(s, size, errors);
				1949
				1950	v = _PyUnicode_New(size);
				1951	if (v == NULL)
				1952	goto onError;
				1953	if (size == 0)
				1954	return (PyObject *)v;
				1955	p = PyUnicode_AS_UNICODE(v);
				1956	while (size-- > 0) {
				1957	unsigned char ch = *s++;
				1958	PyObject w, x;
				1959
				1960	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1961	w = PyInt_FromLong((long)ch);
				1962	if (w == NULL)
				1963	goto onError;
				1964	x = PyObject_GetItem(mapping, w);
				1965	Py_DECREF(w);
				1966	if (x == NULL) {
				1967	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1968	/* No mapping found: default to Latin-1 mapping */
				1969	PyErr_Clear();
				1970	*p++ = (Py_UNICODE)ch;
				1971	continue;
				1972	}
				1973	goto onError;
				1974	}
				1975
				1976	/* Apply mapping */
				1977	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1978	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1979	if (value < 0 \|\| value > 65535) {
				1980	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1981	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1982	Py_DECREF(x);
				1983	goto onError;
				1984	}
				1985	*p++ = (Py_UNICODE)value;
				1986	}
				1987	else if (x == Py_None) {
				1988	/* undefined mapping */
				1989	if (charmap_decoding_error(&s, &p, errors,
				1990	"character maps to <undefined>")) {
				1991	Py_DECREF(x);
				1992	goto onError;
				1993	}
				1994	}
				1995	else if (PyUnicode_Check(x)) {
				1996	if (PyUnicode_GET_SIZE(x) != 1) {
				1997	/* 1-n mapping */
				1998	PyErr_SetString(PyExc_NotImplementedError,
				1999	"1-n mappings are currently not implemented");
				2000	Py_DECREF(x);
				2001	goto onError;
				2002	}
				2003	p++ = PyUnicode_AS_UNICODE(x);
				2004	}
				2005	else {
				2006	/* wrong return value */
				2007	PyErr_SetString(PyExc_TypeError,
				2008	"character mapping must return integer, None or unicode");
				2009	Py_DECREF(x);
				2010	goto onError;
				2011	}
				2012	Py_DECREF(x);
				2013	}
				2014	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2015	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2016	goto onError;
				2017	return (PyObject *)v;
				2018
				2019	onError:
				2020	Py_XDECREF(v);
				2021	return NULL;
				2022	}
				2023
				2024	static
				2025	int charmap_encoding_error(const Py_UNICODE **source,
				2026	char **dest,
				2027	const char *errors,
				2028	const char *details)
				2029	{
				2030	if ((errors == NULL) \|\|
				2031	(strcmp(errors,"strict") == 0)) {
				2032	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2033	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2034	details);
				2035	return -1;
				2036	}
				2037	else if (strcmp(errors,"ignore") == 0) {
				2038	return 0;
				2039	}
				2040	else if (strcmp(errors,"replace") == 0) {
				2041	**dest = '?';
				2042	(*dest)++;
				2043	return 0;
				2044	}
				2045	else {
				2046	PyErr_Format(PyExc_ValueError,
				2047	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2048	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2049	errors);
				2050	return -1;
				2051	}
				2052	}
				2053
				2054	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2055	int size,
				2056	PyObject *mapping,
				2057	const char *errors)
				2058	{
				2059	PyObject *v;
				2060	char *s;
				2061
				2062	/* Default to Latin-1 */
				2063	if (mapping == NULL)
				2064	return PyUnicode_EncodeLatin1(p, size, errors);
				2065
				2066	v = PyString_FromStringAndSize(NULL, size);
				2067	if (v == NULL)
				2068	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2069	if (size == 0)
				2070	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2071	s = PyString_AS_STRING(v);
				2072	while (size-- > 0) {
				2073	Py_UNICODE ch = *p++;
				2074	PyObject w, x;
				2075
				2076	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2077	w = PyInt_FromLong((long)ch);
				2078	if (w == NULL)
				2079	goto onError;
				2080	x = PyObject_GetItem(mapping, w);
				2081	Py_DECREF(w);
				2082	if (x == NULL) {
				2083	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2084	/* No mapping found: default to Latin-1 mapping if possible */
				2085	PyErr_Clear();
				2086	if (ch < 256) {
				2087	*s++ = (char)ch;
				2088	continue;
				2089	}
				2090	else if (!charmap_encoding_error(&p, &s, errors,
				2091	"missing character mapping"))
				2092	continue;
				2093	}
				2094	goto onError;
				2095	}
				2096
				2097	/* Apply mapping */
				2098	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2099	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2100	if (value < 0 \|\| value > 255) {
				2101	PyErr_SetString(PyExc_TypeError,
				2102	"character mapping must be in range(256)");
				2103	Py_DECREF(x);
				2104	goto onError;
				2105	}
				2106	*s++ = (char)value;
				2107	}
				2108	else if (x == Py_None) {
				2109	/* undefined mapping */
				2110	if (charmap_encoding_error(&p, &s, errors,
				2111	"character maps to <undefined>")) {
				2112	Py_DECREF(x);
				2113	goto onError;
				2114	}
				2115	}
				2116	else if (PyString_Check(x)) {
				2117	if (PyString_GET_SIZE(x) != 1) {
				2118	/* 1-n mapping */
				2119	PyErr_SetString(PyExc_NotImplementedError,
				2120	"1-n mappings are currently not implemented");
				2121	Py_DECREF(x);
				2122	goto onError;
				2123	}
				2124	s++ = PyString_AS_STRING(x);
				2125	}
				2126	else {
				2127	/* wrong return value */
				2128	PyErr_SetString(PyExc_TypeError,
				2129	"character mapping must return integer, None or unicode");
				2130	Py_DECREF(x);
				2131	goto onError;
				2132	}
				2133	Py_DECREF(x);
				2134	}
				2135	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2136	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2137	goto onError;
				2138	return v;
				2139
				2140	onError:
				2141	Py_DECREF(v);
				2142	return NULL;
				2143	}
				2144
				2145	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2146	PyObject *mapping)
				2147	{
				2148	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2149	PyErr_BadArgument();
				2150	return NULL;
				2151	}
				2152	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2153	PyUnicode_GET_SIZE(unicode),
				2154	mapping,
				2155	NULL);
				2156	}
				2157
				2158	static
				2159	int translate_error(const Py_UNICODE **source,
				2160	Py_UNICODE **dest,
				2161	const char *errors,
				2162	const char *details)
				2163	{
				2164	if ((errors == NULL) \|\|
				2165	(strcmp(errors,"strict") == 0)) {
				2166	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2167	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2168	details);
				2169	return -1;
				2170	}
				2171	else if (strcmp(errors,"ignore") == 0) {
				2172	return 0;
				2173	}
				2174	else if (strcmp(errors,"replace") == 0) {
				2175	**dest = '?';
				2176	(*dest)++;
				2177	return 0;
				2178	}
				2179	else {
				2180	PyErr_Format(PyExc_ValueError,
				2181	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2182	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2183	errors);
				2184	return -1;
				2185	}
				2186	}
				2187
				2188	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2189	int size,
				2190	PyObject *mapping,
				2191	const char *errors)
				2192	{
				2193	PyUnicodeObject *v;
				2194	Py_UNICODE *p;
				2195
				2196	if (mapping == NULL) {
				2197	PyErr_BadArgument();
				2198	return NULL;
				2199	}
				2200
				2201	/* Output will never be longer than input */
				2202	v = _PyUnicode_New(size);
				2203	if (v == NULL)
				2204	goto onError;
				2205	if (size == 0)
				2206	goto done;
				2207	p = PyUnicode_AS_UNICODE(v);
				2208	while (size-- > 0) {
				2209	Py_UNICODE ch = *s++;
				2210	PyObject w, x;
				2211
				2212	/* Get mapping */
				2213	w = PyInt_FromLong(ch);
				2214	if (w == NULL)
				2215	goto onError;
				2216	x = PyObject_GetItem(mapping, w);
				2217	Py_DECREF(w);
				2218	if (x == NULL) {
				2219	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2220	/* No mapping found: default to 1-1 mapping */
				2221	PyErr_Clear();
				2222	*p++ = ch;
				2223	continue;
				2224	}
				2225	goto onError;
				2226	}
				2227
				2228	/* Apply mapping */
				2229	if (PyInt_Check(x))
				2230	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2231	else if (x == Py_None) {
				2232	/* undefined mapping */
				2233	if (translate_error(&s, &p, errors,
				2234	"character maps to <undefined>")) {
				2235	Py_DECREF(x);
				2236	goto onError;
				2237	}
				2238	}
				2239	else if (PyUnicode_Check(x)) {
				2240	if (PyUnicode_GET_SIZE(x) != 1) {
				2241	/* 1-n mapping */
				2242	PyErr_SetString(PyExc_NotImplementedError,
				2243	"1-n mappings are currently not implemented");
				2244	Py_DECREF(x);
				2245	goto onError;
				2246	}
				2247	p++ = PyUnicode_AS_UNICODE(x);
				2248	}
				2249	else {
				2250	/* wrong return value */
				2251	PyErr_SetString(PyExc_TypeError,
				2252	"translate mapping must return integer, None or unicode");
				2253	Py_DECREF(x);
				2254	goto onError;
				2255	}
				2256	Py_DECREF(x);
				2257	}
				2258	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2259	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2260	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2261
				2262	done:
				2263	return (PyObject *)v;
				2264
				2265	onError:
				2266	Py_XDECREF(v);
				2267	return NULL;
				2268	}
				2269
				2270	PyObject PyUnicode_Translate(PyObject str,
				2271	PyObject *mapping,
				2272	const char *errors)
				2273	{
				2274	PyObject *result;
				2275
				2276	str = PyUnicode_FromObject(str);
				2277	if (str == NULL)
				2278	goto onError;
				2279	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2280	PyUnicode_GET_SIZE(str),
				2281	mapping,
				2282	errors);
				2283	Py_DECREF(str);
				2284	return result;
				2285
				2286	onError:
				2287	Py_XDECREF(str);
				2288	return NULL;
				2289	}
				2290
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2291	/* --- Decimal Encoder ---------------------------------------------------- */
				2292
				2293	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2294	int length,
				2295	char *output,
				2296	const char *errors)
				2297	{
				2298	Py_UNICODE p, end;
				2299
				2300	if (output == NULL) {
				2301	PyErr_BadArgument();
				2302	return -1;
				2303	}
				2304
				2305	p = s;
				2306	end = s + length;
				2307	while (p < end) {
				2308	register Py_UNICODE ch = *p++;
				2309	int decimal;
				2310
				2311	if (Py_UNICODE_ISSPACE(ch)) {
				2312	*output++ = ' ';
				2313	continue;
				2314	}
				2315	decimal = Py_UNICODE_TODECIMAL(ch);
				2316	if (decimal >= 0) {
				2317	*output++ = '0' + decimal;
				2318	continue;
				2319	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2320	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2321	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2322	continue;
				2323	}
				2324	/* All other characters are considered invalid */
				2325	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2326	PyErr_SetString(PyExc_ValueError,
				2327	"invalid decimal Unicode string");
				2328	goto onError;
				2329	}
				2330	else if (strcmp(errors, "ignore") == 0)
				2331	continue;
				2332	else if (strcmp(errors, "replace") == 0) {
				2333	*output++ = '?';
				2334	continue;
				2335	}
				2336	}
				2337	/* 0-terminate the output string */
				2338	*output++ = '\0';
				2339	return 0;
				2340
				2341	onError:
				2342	return -1;
				2343	}
				2344
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2345	/* --- Helpers ------------------------------------------------------------ */
				2346
				2347	static
				2348	int count(PyUnicodeObject *self,
				2349	int start,
				2350	int end,
				2351	PyUnicodeObject *substring)
				2352	{
				2353	int count = 0;
				2354
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2355	if (substring->length == 0)
				2356	return (end - start + 1);
				2357
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2358	end -= substring->length;
				2359
				2360	while (start <= end)
				2361	if (Py_UNICODE_MATCH(self, start, substring)) {
				2362	count++;
				2363	start += substring->length;
				2364	} else
				2365	start++;
				2366
				2367	return count;
				2368	}
				2369
				2370	int PyUnicode_Count(PyObject *str,
				2371	PyObject *substr,
				2372	int start,
				2373	int end)
				2374	{
				2375	int result;
				2376
				2377	str = PyUnicode_FromObject(str);
				2378	if (str == NULL)
				2379	return -1;
				2380	substr = PyUnicode_FromObject(substr);
				2381	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2382	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2383	return -1;
				2384	}
				2385
				2386	result = count((PyUnicodeObject *)str,
				2387	start, end,
				2388	(PyUnicodeObject *)substr);
				2389
				2390	Py_DECREF(str);
				2391	Py_DECREF(substr);
				2392	return result;
				2393	}
				2394
				2395	static
				2396	int findstring(PyUnicodeObject *self,
				2397	PyUnicodeObject *substring,
				2398	int start,
				2399	int end,
				2400	int direction)
				2401	{
				2402	if (start < 0)
				2403	start += self->length;
				2404	if (start < 0)
				2405	start = 0;
				2406
				2407	if (substring->length == 0)
				2408	return start;
				2409
				2410	if (end > self->length)
				2411	end = self->length;
				2412	if (end < 0)
				2413	end += self->length;
				2414	if (end < 0)
				2415	end = 0;
				2416
				2417	end -= substring->length;
				2418
				2419	if (direction < 0) {
				2420	for (; end >= start; end--)
				2421	if (Py_UNICODE_MATCH(self, end, substring))
				2422	return end;
				2423	} else {
				2424	for (; start <= end; start++)
				2425	if (Py_UNICODE_MATCH(self, start, substring))
				2426	return start;
				2427	}
				2428
				2429	return -1;
				2430	}
				2431
				2432	int PyUnicode_Find(PyObject *str,
				2433	PyObject *substr,
				2434	int start,
				2435	int end,
				2436	int direction)
				2437	{
				2438	int result;
				2439
				2440	str = PyUnicode_FromObject(str);
				2441	if (str == NULL)
				2442	return -1;
				2443	substr = PyUnicode_FromObject(substr);
				2444	if (substr == NULL) {
				2445	Py_DECREF(substr);
				2446	return -1;
				2447	}
				2448
				2449	result = findstring((PyUnicodeObject *)str,
				2450	(PyUnicodeObject *)substr,
				2451	start, end, direction);
				2452	Py_DECREF(str);
				2453	Py_DECREF(substr);
				2454	return result;
				2455	}
				2456
				2457	static
				2458	int tailmatch(PyUnicodeObject *self,
				2459	PyUnicodeObject *substring,
				2460	int start,
				2461	int end,
				2462	int direction)
				2463	{
				2464	if (start < 0)
				2465	start += self->length;
				2466	if (start < 0)
				2467	start = 0;
				2468
				2469	if (substring->length == 0)
				2470	return 1;
				2471
				2472	if (end > self->length)
				2473	end = self->length;
				2474	if (end < 0)
				2475	end += self->length;
				2476	if (end < 0)
				2477	end = 0;
				2478
				2479	end -= substring->length;
				2480	if (end < start)
				2481	return 0;
				2482
				2483	if (direction > 0) {
				2484	if (Py_UNICODE_MATCH(self, end, substring))
				2485	return 1;
				2486	} else {
				2487	if (Py_UNICODE_MATCH(self, start, substring))
				2488	return 1;
				2489	}
				2490
				2491	return 0;
				2492	}
				2493
				2494	int PyUnicode_Tailmatch(PyObject *str,
				2495	PyObject *substr,
				2496	int start,
				2497	int end,
				2498	int direction)
				2499	{
				2500	int result;
				2501
				2502	str = PyUnicode_FromObject(str);
				2503	if (str == NULL)
				2504	return -1;
				2505	substr = PyUnicode_FromObject(substr);
				2506	if (substr == NULL) {
				2507	Py_DECREF(substr);
				2508	return -1;
				2509	}
				2510
				2511	result = tailmatch((PyUnicodeObject *)str,
				2512	(PyUnicodeObject *)substr,
				2513	start, end, direction);
				2514	Py_DECREF(str);
				2515	Py_DECREF(substr);
				2516	return result;
				2517	}
				2518
				2519	static
				2520	const Py_UNICODE findchar(const Py_UNICODE s,
				2521	int size,
				2522	Py_UNICODE ch)
				2523	{
				2524	/* like wcschr, but doesn't stop at NULL characters */
				2525
				2526	while (size-- > 0) {
				2527	if (*s == ch)
				2528	return s;
				2529	s++;
				2530	}
				2531
				2532	return NULL;
				2533	}
				2534
				2535	/* Apply fixfct filter to the Unicode object self and return a
				2536	reference to the modified object */
				2537
				2538	static
				2539	PyObject fixup(PyUnicodeObject self,
				2540	int (fixfct)(PyUnicodeObject s))
				2541	{
				2542
				2543	PyUnicodeObject *u;
				2544
				2545	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2546	self->length);
				2547	if (u == NULL)
				2548	return NULL;
				2549	if (!fixfct(u)) {
				2550	/* fixfct should return TRUE if it modified the buffer. If
				2551	FALSE, return a reference to the original buffer instead
				2552	(to save space, not time) */
				2553	Py_INCREF(self);
				2554	Py_DECREF(u);
				2555	return (PyObject*) self;
				2556	}
				2557	return (PyObject*) u;
				2558	}
				2559
				2560	static
				2561	int fixupper(PyUnicodeObject *self)
				2562	{
				2563	int len = self->length;
				2564	Py_UNICODE *s = self->str;
				2565	int status = 0;
				2566
				2567	while (len-- > 0) {
				2568	register Py_UNICODE ch;
				2569
				2570	ch = Py_UNICODE_TOUPPER(*s);
				2571	if (ch != *s) {
				2572	status = 1;
				2573	*s = ch;
				2574	}
				2575	s++;
				2576	}
				2577
				2578	return status;
				2579	}
				2580
				2581	static
				2582	int fixlower(PyUnicodeObject *self)
				2583	{
				2584	int len = self->length;
				2585	Py_UNICODE *s = self->str;
				2586	int status = 0;
				2587
				2588	while (len-- > 0) {
				2589	register Py_UNICODE ch;
				2590
				2591	ch = Py_UNICODE_TOLOWER(*s);
				2592	if (ch != *s) {
				2593	status = 1;
				2594	*s = ch;
				2595	}
				2596	s++;
				2597	}
				2598
				2599	return status;
				2600	}
				2601
				2602	static
				2603	int fixswapcase(PyUnicodeObject *self)
				2604	{
				2605	int len = self->length;
				2606	Py_UNICODE *s = self->str;
				2607	int status = 0;
				2608
				2609	while (len-- > 0) {
				2610	if (Py_UNICODE_ISUPPER(*s)) {
				2611	s = Py_UNICODE_TOLOWER(s);
				2612	status = 1;
				2613	} else if (Py_UNICODE_ISLOWER(*s)) {
				2614	s = Py_UNICODE_TOUPPER(s);
				2615	status = 1;
				2616	}
				2617	s++;
				2618	}
				2619
				2620	return status;
				2621	}
				2622
				2623	static
				2624	int fixcapitalize(PyUnicodeObject *self)
				2625	{
				2626	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2627	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2628	return 1;
				2629	}
				2630	return 0;
				2631	}
				2632
				2633	static
				2634	int fixtitle(PyUnicodeObject *self)
				2635	{
				2636	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2637	register Py_UNICODE *e;
				2638	int previous_is_cased;
				2639
				2640	/* Shortcut for single character strings */
				2641	if (PyUnicode_GET_SIZE(self) == 1) {
				2642	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2643	if (*p != ch) {
				2644	*p = ch;
				2645	return 1;
				2646	}
				2647	else
				2648	return 0;
				2649	}
				2650
				2651	e = p + PyUnicode_GET_SIZE(self);
				2652	previous_is_cased = 0;
				2653	for (; p < e; p++) {
				2654	register const Py_UNICODE ch = *p;
				2655
				2656	if (previous_is_cased)
				2657	*p = Py_UNICODE_TOLOWER(ch);
				2658	else
				2659	*p = Py_UNICODE_TOTITLE(ch);
				2660
				2661	if (Py_UNICODE_ISLOWER(ch) \|\|
				2662	Py_UNICODE_ISUPPER(ch) \|\|
				2663	Py_UNICODE_ISTITLE(ch))
				2664	previous_is_cased = 1;
				2665	else
				2666	previous_is_cased = 0;
				2667	}
				2668	return 1;
				2669	}
				2670
				2671	PyObject PyUnicode_Join(PyObject separator,
				2672	PyObject *seq)
				2673	{
				2674	Py_UNICODE *sep;
				2675	int seplen;
				2676	PyUnicodeObject *res = NULL;
				2677	int reslen = 0;
				2678	Py_UNICODE *p;
				2679	int seqlen = 0;
				2680	int sz = 100;
				2681	int i;
				2682
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2683	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2684	if (seqlen < 0 && PyErr_Occurred())
				2685	return NULL;
				2686
				2687	if (separator == NULL) {
				2688	Py_UNICODE blank = ' ';
				2689	sep = &blank;
				2690	seplen = 1;
				2691	}
				2692	else {
				2693	separator = PyUnicode_FromObject(separator);
				2694	if (separator == NULL)
				2695	return NULL;
				2696	sep = PyUnicode_AS_UNICODE(separator);
				2697	seplen = PyUnicode_GET_SIZE(separator);
				2698	}
				2699
				2700	res = _PyUnicode_New(sz);
				2701	if (res == NULL)
				2702	goto onError;
				2703	p = PyUnicode_AS_UNICODE(res);
				2704	reslen = 0;
				2705
				2706	for (i = 0; i < seqlen; i++) {
				2707	int itemlen;
				2708	PyObject *item;
				2709
				2710	item = PySequence_GetItem(seq, i);
				2711	if (item == NULL)
				2712	goto onError;
				2713	if (!PyUnicode_Check(item)) {
				2714	PyObject *v;
				2715	v = PyUnicode_FromObject(item);
				2716	Py_DECREF(item);
				2717	item = v;
				2718	if (item == NULL)
				2719	goto onError;
				2720	}
				2721	itemlen = PyUnicode_GET_SIZE(item);
				2722	while (reslen + itemlen + seplen >= sz) {
				2723	if (_PyUnicode_Resize(res, sz*2))
				2724	goto onError;
				2725	sz *= 2;
				2726	p = PyUnicode_AS_UNICODE(res) + reslen;
				2727	}
				2728	if (i > 0) {
				2729	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2730	p += seplen;
				2731	reslen += seplen;
				2732	}
				2733	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2734	p += itemlen;
				2735	reslen += itemlen;
				2736	Py_DECREF(item);
				2737	}
				2738	if (_PyUnicode_Resize(res, reslen))
				2739	goto onError;
				2740
				2741	Py_XDECREF(separator);
				2742	return (PyObject *)res;
				2743
				2744	onError:
				2745	Py_XDECREF(separator);
				2746	Py_DECREF(res);
				2747	return NULL;
				2748	}
				2749
				2750	static
				2751	PyUnicodeObject pad(PyUnicodeObject self,
				2752	int left,
				2753	int right,
				2754	Py_UNICODE fill)
				2755	{
				2756	PyUnicodeObject *u;
				2757
				2758	if (left < 0)
				2759	left = 0;
				2760	if (right < 0)
				2761	right = 0;
				2762
				2763	if (left == 0 && right == 0) {
				2764	Py_INCREF(self);
				2765	return self;
				2766	}
				2767
				2768	u = _PyUnicode_New(left + self->length + right);
				2769	if (u) {
				2770	if (left)
				2771	Py_UNICODE_FILL(u->str, fill, left);
				2772	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2773	if (right)
				2774	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2775	}
				2776
				2777	return u;
				2778	}
				2779
				2780	#define SPLIT_APPEND(data, left, right) \
				2781	str = PyUnicode_FromUnicode(data + left, right - left); \
				2782	if (!str) \
				2783	goto onError; \
				2784	if (PyList_Append(list, str)) { \
				2785	Py_DECREF(str); \
				2786	goto onError; \
				2787	} \
				2788	else \
				2789	Py_DECREF(str);
				2790
				2791	static
				2792	PyObject split_whitespace(PyUnicodeObject self,
				2793	PyObject *list,
				2794	int maxcount)
				2795	{
				2796	register int i;
				2797	register int j;
				2798	int len = self->length;
				2799	PyObject *str;
				2800
				2801	for (i = j = 0; i < len; ) {
				2802	/* find a token */
				2803	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2804	i++;
				2805	j = i;
				2806	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2807	i++;
				2808	if (j < i) {
				2809	if (maxcount-- <= 0)
				2810	break;
				2811	SPLIT_APPEND(self->str, j, i);
				2812	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2813	i++;
				2814	j = i;
				2815	}
				2816	}
				2817	if (j < len) {
				2818	SPLIT_APPEND(self->str, j, len);
				2819	}
				2820	return list;
				2821
				2822	onError:
				2823	Py_DECREF(list);
				2824	return NULL;
				2825	}
				2826
				2827	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2828	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2829	{
				2830	register int i;
				2831	register int j;
				2832	int len;
				2833	PyObject *list;
				2834	PyObject *str;
				2835	Py_UNICODE *data;
				2836
				2837	string = PyUnicode_FromObject(string);
				2838	if (string == NULL)
				2839	return NULL;
				2840	data = PyUnicode_AS_UNICODE(string);
				2841	len = PyUnicode_GET_SIZE(string);
				2842
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2843	list = PyList_New(0);
				2844	if (!list)
				2845	goto onError;
				2846
				2847	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2848	int eol;
				2849
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2850	/* Find a line and append it */
				2851	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2852	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2853
				2854	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2855	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2856	if (i < len) {
				2857	if (data[i] == '\r' && i + 1 < len &&
				2858	data[i+1] == '\n')
				2859	i += 2;
				2860	else
				2861	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2862	if (keepends)
				2863	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2864	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2865	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2866	j = i;
				2867	}
				2868	if (j < len) {
				2869	SPLIT_APPEND(data, j, len);
				2870	}
				2871
				2872	Py_DECREF(string);
				2873	return list;
				2874
				2875	onError:
				2876	Py_DECREF(list);
				2877	Py_DECREF(string);
				2878	return NULL;
				2879	}
				2880
				2881	static
				2882	PyObject split_char(PyUnicodeObject self,
				2883	PyObject *list,
				2884	Py_UNICODE ch,
				2885	int maxcount)
				2886	{
				2887	register int i;
				2888	register int j;
				2889	int len = self->length;
				2890	PyObject *str;
				2891
				2892	for (i = j = 0; i < len; ) {
				2893	if (self->str[i] == ch) {
				2894	if (maxcount-- <= 0)
				2895	break;
				2896	SPLIT_APPEND(self->str, j, i);
				2897	i = j = i + 1;
				2898	} else
				2899	i++;
				2900	}
				2901	if (j <= len) {
				2902	SPLIT_APPEND(self->str, j, len);
				2903	}
				2904	return list;
				2905
				2906	onError:
				2907	Py_DECREF(list);
				2908	return NULL;
				2909	}
				2910
				2911	static
				2912	PyObject split_substring(PyUnicodeObject self,
				2913	PyObject *list,
				2914	PyUnicodeObject *substring,
				2915	int maxcount)
				2916	{
				2917	register int i;
				2918	register int j;
				2919	int len = self->length;
				2920	int sublen = substring->length;
				2921	PyObject *str;
				2922
				2923	for (i = j = 0; i < len - sublen; ) {
				2924	if (Py_UNICODE_MATCH(self, i, substring)) {
				2925	if (maxcount-- <= 0)
				2926	break;
				2927	SPLIT_APPEND(self->str, j, i);
				2928	i = j = i + sublen;
				2929	} else
				2930	i++;
				2931	}
				2932	if (j <= len) {
				2933	SPLIT_APPEND(self->str, j, len);
				2934	}
				2935	return list;
				2936
				2937	onError:
				2938	Py_DECREF(list);
				2939	return NULL;
				2940	}
				2941
				2942	#undef SPLIT_APPEND
				2943
				2944	static
				2945	PyObject split(PyUnicodeObject self,
				2946	PyUnicodeObject *substring,
				2947	int maxcount)
				2948	{
				2949	PyObject *list;
				2950
				2951	if (maxcount < 0)
				2952	maxcount = INT_MAX;
				2953
				2954	list = PyList_New(0);
				2955	if (!list)
				2956	return NULL;
				2957
				2958	if (substring == NULL)
				2959	return split_whitespace(self,list,maxcount);
				2960
				2961	else if (substring->length == 1)
				2962	return split_char(self,list,substring->str[0],maxcount);
				2963
				2964	else if (substring->length == 0) {
				2965	Py_DECREF(list);
				2966	PyErr_SetString(PyExc_ValueError, "empty separator");
				2967	return NULL;
				2968	}
				2969	else
				2970	return split_substring(self,list,substring,maxcount);
				2971	}
				2972
				2973	static
				2974	PyObject strip(PyUnicodeObject self,
				2975	int left,
				2976	int right)
				2977	{
				2978	Py_UNICODE *p = self->str;
				2979	int start = 0;
				2980	int end = self->length;
				2981
				2982	if (left)
				2983	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2984	start++;
				2985
				2986	if (right)
				2987	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2988	end--;
				2989
				2990	if (start == 0 && end == self->length) {
				2991	/* couldn't strip anything off, return original string */
				2992	Py_INCREF(self);
				2993	return (PyObject*) self;
				2994	}
				2995
				2996	return (PyObject*) PyUnicode_FromUnicode(
				2997	self->str + start,
				2998	end - start
				2999	);
				3000	}
				3001
				3002	static
				3003	PyObject replace(PyUnicodeObject self,
				3004	PyUnicodeObject *str1,
				3005	PyUnicodeObject *str2,
				3006	int maxcount)
				3007	{
				3008	PyUnicodeObject *u;
				3009
				3010	if (maxcount < 0)
				3011	maxcount = INT_MAX;
				3012
				3013	if (str1->length == 1 && str2->length == 1) {
				3014	int i;
				3015
				3016	/* replace characters */
				3017	if (!findchar(self->str, self->length, str1->str[0])) {
				3018	/* nothing to replace, return original string */
				3019	Py_INCREF(self);
				3020	u = self;
				3021	} else {
				3022	Py_UNICODE u1 = str1->str[0];
				3023	Py_UNICODE u2 = str2->str[0];
				3024
				3025	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3026	self->str,
				3027	self->length
				3028	);
				3029	if (u)
				3030	for (i = 0; i < u->length; i++)
				3031	if (u->str[i] == u1) {
				3032	if (--maxcount < 0)
				3033	break;
				3034	u->str[i] = u2;
				3035	}
				3036	}
				3037
				3038	} else {
				3039	int n, i;
				3040	Py_UNICODE *p;
				3041
				3042	/* replace strings */
				3043	n = count(self, 0, self->length, str1);
				3044	if (n > maxcount)
				3045	n = maxcount;
				3046	if (n == 0) {
				3047	/* nothing to replace, return original string */
				3048	Py_INCREF(self);
				3049	u = self;
				3050	} else {
				3051	u = _PyUnicode_New(
				3052	self->length + n * (str2->length - str1->length));
				3053	if (u) {
				3054	i = 0;
				3055	p = u->str;
				3056	while (i <= self->length - str1->length)
				3057	if (Py_UNICODE_MATCH(self, i, str1)) {
				3058	/* replace string segment */
				3059	Py_UNICODE_COPY(p, str2->str, str2->length);
				3060	p += str2->length;
				3061	i += str1->length;
				3062	if (--n <= 0) {
				3063	/* copy remaining part */
				3064	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3065	break;
				3066	}
				3067	} else
				3068	*p++ = self->str[i++];
				3069	}
				3070	}
				3071	}
				3072
				3073	return (PyObject *) u;
				3074	}
				3075
				3076	/* --- Unicode Object Methods --------------------------------------------- */
				3077
				3078	static char title__doc__[] =
				3079	"S.title() -> unicode\n\
				3080	\n\
				3081	Return a titlecased version of S, i.e. words start with title case\n\
				3082	characters, all remaining cased characters have lower case.";
				3083
				3084	static PyObject*
				3085	unicode_title(PyUnicodeObject self, PyObject args)
				3086	{
				3087	if (!PyArg_NoArgs(args))
				3088	return NULL;
				3089	return fixup(self, fixtitle);
				3090	}
				3091
				3092	static char capitalize__doc__[] =
				3093	"S.capitalize() -> unicode\n\
				3094	\n\
				3095	Return a capitalized version of S, i.e. make the first character\n\
				3096	have upper case.";
				3097
				3098	static PyObject*
				3099	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3100	{
				3101	if (!PyArg_NoArgs(args))
				3102	return NULL;
				3103	return fixup(self, fixcapitalize);
				3104	}
				3105
				3106	#if 0
				3107	static char capwords__doc__[] =
				3108	"S.capwords() -> unicode\n\
				3109	\n\
				3110	Apply .capitalize() to all words in S and return the result with\n\
				3111	normalized whitespace (all whitespace strings are replaced by ' ').";
				3112
				3113	static PyObject*
				3114	unicode_capwords(PyUnicodeObject self, PyObject args)
				3115	{
				3116	PyObject *list;
				3117	PyObject *item;
				3118	int i;
				3119
				3120	if (!PyArg_NoArgs(args))
				3121	return NULL;
				3122
				3123	/* Split into words */
				3124	list = split(self, NULL, -1);
				3125	if (!list)
				3126	return NULL;
				3127
				3128	/* Capitalize each word */
				3129	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3130	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3131	fixcapitalize);
				3132	if (item == NULL)
				3133	goto onError;
				3134	Py_DECREF(PyList_GET_ITEM(list, i));
				3135	PyList_SET_ITEM(list, i, item);
				3136	}
				3137
				3138	/* Join the words to form a new string */
				3139	item = PyUnicode_Join(NULL, list);
				3140
				3141	onError:
				3142	Py_DECREF(list);
				3143	return (PyObject *)item;
				3144	}
				3145	#endif
				3146
				3147	static char center__doc__[] =
				3148	"S.center(width) -> unicode\n\
				3149	\n\
				3150	Return S centered in a Unicode string of length width. Padding is done\n\
				3151	using spaces.";
				3152
				3153	static PyObject *
				3154	unicode_center(PyUnicodeObject self, PyObject args)
				3155	{
				3156	int marg, left;
				3157	int width;
				3158
				3159	if (!PyArg_ParseTuple(args, "i:center", &width))
				3160	return NULL;
				3161
				3162	if (self->length >= width) {
				3163	Py_INCREF(self);
				3164	return (PyObject*) self;
				3165	}
				3166
				3167	marg = width - self->length;
				3168	left = marg / 2 + (marg & width & 1);
				3169
				3170	return (PyObject*) pad(self, left, marg - left, ' ');
				3171	}
				3172
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3173	#if 0
				3174
				3175	/* This code should go into some future Unicode collation support
				3176	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3177	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3178
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3179	/* speedy UTF-16 code point order comparison */
				3180	/* gleaned from: */
				3181	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3182
/* Per-block adjustment table, indexed by (code unit >> 11):
   entries 0..26 are 0, entry 27 is 0x2000, entries 28..31 are -0x800. */
static short utf16Fixup[32] =
{
    /*  0.. 3 */ 0, 0, 0, 0,
    /*  4.. 7 */ 0, 0, 0, 0,
    /*  8..11 */ 0, 0, 0, 0,
    /* 12..15 */ 0, 0, 0, 0,
    /* 16..19 */ 0, 0, 0, 0,
    /* 20..23 */ 0, 0, 0, 0,
    /* 24..27 */ 0, 0, 0, 0x2000,
    /* 28..31 */ -0x800, -0x800, -0x800, -0x800
};
				3190
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3191	static int
				3192	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3193	{
				3194	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3195
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3196	Py_UNICODE *s1 = str1->str;
				3197	Py_UNICODE *s2 = str2->str;
				3198
				3199	len1 = str1->length;
				3200	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3201
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3202	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3203	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3204	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3205
				3206	c1 = *s1++;
				3207	c2 = *s2++;
				3208	if (c1 > (1<<11) * 26)
				3209	c1 += utf16Fixup[c1>>11];
				3210	if (c2 > (1<<11) * 26)
				3211	c2 += utf16Fixup[c2>>11];
				3212
				3213	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3214	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3215	if (diff)
				3216	return (diff < 0) ? -1 : (diff != 0);
				3217	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3218	}
				3219
				3220	return (len1 < len2) ? -1 : (len1 != len2);
				3221	}
				3222
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3223	#else
				3224
				3225	static int
				3226	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3227	{
				3228	register int len1, len2;
				3229
				3230	Py_UNICODE *s1 = str1->str;
				3231	Py_UNICODE *s2 = str2->str;
				3232
				3233	len1 = str1->length;
				3234	len2 = str2->length;
				3235
				3236	while (len1 > 0 && len2 > 0) {
				3237	register long diff;
				3238
				3239	diff = (long)s1++ - (long)s2++;
				3240	if (diff)
				3241	return (diff < 0) ? -1 : (diff != 0);
				3242	len1--; len2--;
				3243	}
				3244
				3245	return (len1 < len2) ? -1 : (len1 != len2);
				3246	}
				3247
				3248	#endif
				3249
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3250	int PyUnicode_Compare(PyObject *left,
				3251	PyObject *right)
				3252	{
				3253	PyUnicodeObject u = NULL, v = NULL;
				3254	int result;
				3255
				3256	/* Coerce the two arguments */
				3257	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3258	if (u == NULL)
				3259	goto onError;
				3260	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3261	if (v == NULL)
				3262	goto onError;
				3263
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3264	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3265	if (v == u) {
				3266	Py_DECREF(u);
				3267	Py_DECREF(v);
				3268	return 0;
				3269	}
				3270
				3271	result = unicode_compare(u, v);
				3272
				3273	Py_DECREF(u);
				3274	Py_DECREF(v);
				3275	return result;
				3276
				3277	onError:
				3278	Py_XDECREF(u);
				3279	Py_XDECREF(v);
				3280	return -1;
				3281	}
				3282
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3283	int PyUnicode_Contains(PyObject *container,
				3284	PyObject *element)
				3285	{
				3286	PyUnicodeObject u = NULL, v = NULL;
				3287	int result;
				3288	register const Py_UNICODE p, e;
				3289	register Py_UNICODE ch;
				3290
				3291	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3292	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3293	if (v == NULL) {
				3294	PyErr_SetString(PyExc_TypeError,
				3295	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3296	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3297	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3298	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3299	if (u == NULL) {
				3300	Py_DECREF(v);
				3301	goto onError;
				3302	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3303
				3304	/* Check v in u */
				3305	if (PyUnicode_GET_SIZE(v) != 1) {
				3306	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3307	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3308	goto onError;
				3309	}
				3310	ch = *PyUnicode_AS_UNICODE(v);
				3311	p = PyUnicode_AS_UNICODE(u);
				3312	e = p + PyUnicode_GET_SIZE(u);
				3313	result = 0;
				3314	while (p < e) {
				3315	if (*p++ == ch) {
				3316	result = 1;
				3317	break;
				3318	}
				3319	}
				3320
				3321	Py_DECREF(u);
				3322	Py_DECREF(v);
				3323	return result;
				3324
				3325	onError:
				3326	Py_XDECREF(u);
				3327	Py_XDECREF(v);
				3328	return -1;
				3329	}
				3330
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3331	/* Concat to string or Unicode object giving a new Unicode object. */
				3332
				3333	PyObject PyUnicode_Concat(PyObject left,
				3334	PyObject *right)
				3335	{
				3336	PyUnicodeObject u = NULL, v = NULL, *w;
				3337
				3338	/* Coerce the two arguments */
				3339	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3340	if (u == NULL)
				3341	goto onError;
				3342	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3343	if (v == NULL)
				3344	goto onError;
				3345
				3346	/* Shortcuts */
				3347	if (v == unicode_empty) {
				3348	Py_DECREF(v);
				3349	return (PyObject *)u;
				3350	}
				3351	if (u == unicode_empty) {
				3352	Py_DECREF(u);
				3353	return (PyObject *)v;
				3354	}
				3355
				3356	/* Concat the two Unicode strings */
				3357	w = _PyUnicode_New(u->length + v->length);
				3358	if (w == NULL)
				3359	goto onError;
				3360	Py_UNICODE_COPY(w->str, u->str, u->length);
				3361	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3362
				3363	Py_DECREF(u);
				3364	Py_DECREF(v);
				3365	return (PyObject *)w;
				3366
				3367	onError:
				3368	Py_XDECREF(u);
				3369	Py_XDECREF(v);
				3370	return NULL;
				3371	}
				3372
				3373	static char count__doc__[] =
				3374	"S.count(sub[, start[, end]]) -> int\n\
				3375	\n\
				3376	Return the number of occurrences of substring sub in Unicode string\n\
				3377	S[start:end]. Optional arguments start and end are\n\
				3378	interpreted as in slice notation.";
				3379
				3380	static PyObject *
				3381	unicode_count(PyUnicodeObject self, PyObject args)
				3382	{
				3383	PyUnicodeObject *substring;
				3384	int start = 0;
				3385	int end = INT_MAX;
				3386	PyObject *result;
				3387
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3388	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3389	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3390	return NULL;
				3391
				3392	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3393	(PyObject *)substring);
				3394	if (substring == NULL)
				3395	return NULL;
				3396
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3397	if (start < 0)
				3398	start += self->length;
				3399	if (start < 0)
				3400	start = 0;
				3401	if (end > self->length)
				3402	end = self->length;
				3403	if (end < 0)
				3404	end += self->length;
				3405	if (end < 0)
				3406	end = 0;
				3407
				3408	result = PyInt_FromLong((long) count(self, start, end, substring));
				3409
				3410	Py_DECREF(substring);
				3411	return result;
				3412	}
				3413
				3414	static char encode__doc__[] =
				3415	"S.encode([encoding[,errors]]) -> string\n\
				3416	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3417	Return an encoded string version of S. Default encoding is the current\n\
				3418	default string encoding. errors may be given to set a different error\n\
				3419	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3420	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3421
				3422	static PyObject *
				3423	unicode_encode(PyUnicodeObject self, PyObject args)
				3424	{
				3425	char *encoding = NULL;
				3426	char *errors = NULL;
				3427	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3428	return NULL;
				3429	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3430	}
				3431
				3432	static char expandtabs__doc__[] =
				3433	"S.expandtabs([tabsize]) -> unicode\n\
				3434	\n\
				3435	Return a copy of S where all tab characters are expanded using spaces.\n\
				3436	If tabsize is not given, a tab size of 8 characters is assumed.";
				3437
				3438	static PyObject*
				3439	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3440	{
				3441	Py_UNICODE *e;
				3442	Py_UNICODE *p;
				3443	Py_UNICODE *q;
				3444	int i, j;
				3445	PyUnicodeObject *u;
				3446	int tabsize = 8;
				3447
				3448	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3449	return NULL;
				3450
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3451	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3452	i = j = 0;
				3453	e = self->str + self->length;
				3454	for (p = self->str; p < e; p++)
				3455	if (*p == '\t') {
				3456	if (tabsize > 0)
				3457	j += tabsize - (j % tabsize);
				3458	}
				3459	else {
				3460	j++;
				3461	if (p == '\n' \|\| p == '\r') {
				3462	i += j;
				3463	j = 0;
				3464	}
				3465	}
				3466
				3467	/* Second pass: create output string and fill it */
				3468	u = _PyUnicode_New(i + j);
				3469	if (!u)
				3470	return NULL;
				3471
				3472	j = 0;
				3473	q = u->str;
				3474
				3475	for (p = self->str; p < e; p++)
				3476	if (*p == '\t') {
				3477	if (tabsize > 0) {
				3478	i = tabsize - (j % tabsize);
				3479	j += i;
				3480	while (i--)
				3481	*q++ = ' ';
				3482	}
				3483	}
				3484	else {
				3485	j++;
				3486	q++ = p;
				3487	if (p == '\n' \|\| p == '\r')
				3488	j = 0;
				3489	}
				3490
				3491	return (PyObject*) u;
				3492	}
				3493
				3494	static char find__doc__[] =
				3495	"S.find(sub [,start [,end]]) -> int\n\
				3496	\n\
				3497	Return the lowest index in S where substring sub is found,\n\
				3498	such that sub is contained within s[start,end]. Optional\n\
				3499	arguments start and end are interpreted as in slice notation.\n\
				3500	\n\
				3501	Return -1 on failure.";
				3502
				3503	static PyObject *
				3504	unicode_find(PyUnicodeObject self, PyObject args)
				3505	{
				3506	PyUnicodeObject *substring;
				3507	int start = 0;
				3508	int end = INT_MAX;
				3509	PyObject *result;
				3510
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3511	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3512	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3513	return NULL;
				3514	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3515	(PyObject *)substring);
				3516	if (substring == NULL)
				3517	return NULL;
				3518
				3519	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3520
				3521	Py_DECREF(substring);
				3522	return result;
				3523	}
				3524
				3525	static PyObject *
				3526	unicode_getitem(PyUnicodeObject *self, int index)
				3527	{
				3528	if (index < 0 \|\| index >= self->length) {
				3529	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3530	return NULL;
				3531	}
				3532
				3533	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3534	}
				3535
				3536	static long
				3537	unicode_hash(PyUnicodeObject *self)
				3538	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3539	/* Since Unicode objects compare equal to their ASCII string
				3540	counterparts, they should use the individual character values
				3541	as basis for their hash value. This is needed to assure that
				3542	strings and Unicode objects behave in the same way as
				3543	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3544
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3545	register int len;
				3546	register Py_UNICODE *p;
				3547	register long x;
				3548
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3549	if (self->hash != -1)
				3550	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3551	len = PyUnicode_GET_SIZE(self);
				3552	p = PyUnicode_AS_UNICODE(self);
				3553	x = *p << 7;
				3554	while (--len >= 0)
				3555	x = (1000003x) ^ p++;
				3556	x ^= PyUnicode_GET_SIZE(self);
				3557	if (x == -1)
				3558	x = -2;
				3559	self->hash = x;
				3560	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3561	}
				3562
				3563	static char index__doc__[] =
				3564	"S.index(sub [,start [,end]]) -> int\n\
				3565	\n\
				3566	Like S.find() but raise ValueError when the substring is not found.";
				3567
				3568	static PyObject *
				3569	unicode_index(PyUnicodeObject self, PyObject args)
				3570	{
				3571	int result;
				3572	PyUnicodeObject *substring;
				3573	int start = 0;
				3574	int end = INT_MAX;
				3575
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3576	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3577	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3578	return NULL;
				3579
				3580	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3581	(PyObject *)substring);
				3582	if (substring == NULL)
				3583	return NULL;
				3584
				3585	result = findstring(self, substring, start, end, 1);
				3586
				3587	Py_DECREF(substring);
				3588	if (result < 0) {
				3589	PyErr_SetString(PyExc_ValueError, "substring not found");
				3590	return NULL;
				3591	}
				3592	return PyInt_FromLong(result);
				3593	}
				3594
				3595	static char islower__doc__[] =
				3596	"S.islower() -> int\n\
				3597	\n\
				3598	Return 1 if all cased characters in S are lowercase and there is\n\
				3599	at least one cased character in S, 0 otherwise.";
				3600
				3601	static PyObject*
				3602	unicode_islower(PyUnicodeObject self, PyObject args)
				3603	{
				3604	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3605	register const Py_UNICODE *e;
				3606	int cased;
				3607
				3608	if (!PyArg_NoArgs(args))
				3609	return NULL;
				3610
				3611	/* Shortcut for single character strings */
				3612	if (PyUnicode_GET_SIZE(self) == 1)
				3613	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3614
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3615	/* Special case for empty strings */
				3616	if (PyString_GET_SIZE(self) == 0)
				3617	return PyInt_FromLong(0);
				3618
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3619	e = p + PyUnicode_GET_SIZE(self);
				3620	cased = 0;
				3621	for (; p < e; p++) {
				3622	register const Py_UNICODE ch = *p;
				3623
				3624	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3625	return PyInt_FromLong(0);
				3626	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3627	cased = 1;
				3628	}
				3629	return PyInt_FromLong(cased);
				3630	}
				3631
				3632	static char isupper__doc__[] =
				3633	"S.isupper() -> int\n\
				3634	\n\
				3635	Return 1 if all cased characters in S are uppercase and there is\n\
				3636	at least one cased character in S, 0 otherwise.";
				3637
				3638	static PyObject*
				3639	unicode_isupper(PyUnicodeObject self, PyObject args)
				3640	{
				3641	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3642	register const Py_UNICODE *e;
				3643	int cased;
				3644
				3645	if (!PyArg_NoArgs(args))
				3646	return NULL;
				3647
				3648	/* Shortcut for single character strings */
				3649	if (PyUnicode_GET_SIZE(self) == 1)
				3650	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3651
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3652	/* Special case for empty strings */
				3653	if (PyString_GET_SIZE(self) == 0)
				3654	return PyInt_FromLong(0);
				3655
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3656	e = p + PyUnicode_GET_SIZE(self);
				3657	cased = 0;
				3658	for (; p < e; p++) {
				3659	register const Py_UNICODE ch = *p;
				3660
				3661	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3662	return PyInt_FromLong(0);
				3663	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3664	cased = 1;
				3665	}
				3666	return PyInt_FromLong(cased);
				3667	}
				3668
				3669	static char istitle__doc__[] =
				3670	"S.istitle() -> int\n\
				3671	\n\
				3672	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3673	may only follow uncased characters and lowercase characters only cased\n\
				3674	ones. Return 0 otherwise.";
				3675
				3676	static PyObject*
				3677	unicode_istitle(PyUnicodeObject self, PyObject args)
				3678	{
				3679	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3680	register const Py_UNICODE *e;
				3681	int cased, previous_is_cased;
				3682
				3683	if (!PyArg_NoArgs(args))
				3684	return NULL;
				3685
				3686	/* Shortcut for single character strings */
				3687	if (PyUnicode_GET_SIZE(self) == 1)
				3688	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3689	(Py_UNICODE_ISUPPER(*p) != 0));
				3690
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3691	/* Special case for empty strings */
				3692	if (PyString_GET_SIZE(self) == 0)
				3693	return PyInt_FromLong(0);
				3694
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3695	e = p + PyUnicode_GET_SIZE(self);
				3696	cased = 0;
				3697	previous_is_cased = 0;
				3698	for (; p < e; p++) {
				3699	register const Py_UNICODE ch = *p;
				3700
				3701	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3702	if (previous_is_cased)
				3703	return PyInt_FromLong(0);
				3704	previous_is_cased = 1;
				3705	cased = 1;
				3706	}
				3707	else if (Py_UNICODE_ISLOWER(ch)) {
				3708	if (!previous_is_cased)
				3709	return PyInt_FromLong(0);
				3710	previous_is_cased = 1;
				3711	cased = 1;
				3712	}
				3713	else
				3714	previous_is_cased = 0;
				3715	}
				3716	return PyInt_FromLong(cased);
				3717	}
				3718
				3719	static char isspace__doc__[] =
				3720	"S.isspace() -> int\n\
				3721	\n\
				3722	Return 1 if there are only whitespace characters in S,\n\
				3723	0 otherwise.";
				3724
				3725	static PyObject*
				3726	unicode_isspace(PyUnicodeObject self, PyObject args)
				3727	{
				3728	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3729	register const Py_UNICODE *e;
				3730
				3731	if (!PyArg_NoArgs(args))
				3732	return NULL;
				3733
				3734	/* Shortcut for single character strings */
				3735	if (PyUnicode_GET_SIZE(self) == 1 &&
				3736	Py_UNICODE_ISSPACE(*p))
				3737	return PyInt_FromLong(1);
				3738
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3739	/* Special case for empty strings */
				3740	if (PyString_GET_SIZE(self) == 0)
				3741	return PyInt_FromLong(0);
				3742
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3743	e = p + PyUnicode_GET_SIZE(self);
				3744	for (; p < e; p++) {
				3745	if (!Py_UNICODE_ISSPACE(*p))
				3746	return PyInt_FromLong(0);
				3747	}
				3748	return PyInt_FromLong(1);
				3749	}
				3750
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3751	static char isalpha__doc__[] =
				3752	"S.isalpha() -> int\n\
				3753	\n\
				3754	Return 1 if all characters in S are alphabetic\n\
				3755	and there is at least one character in S, 0 otherwise.";
				3756
				3757	static PyObject*
				3758	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3759	{
				3760	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3761	register const Py_UNICODE *e;
				3762
				3763	if (!PyArg_NoArgs(args))
				3764	return NULL;
				3765
				3766	/* Shortcut for single character strings */
				3767	if (PyUnicode_GET_SIZE(self) == 1 &&
				3768	Py_UNICODE_ISALPHA(*p))
				3769	return PyInt_FromLong(1);
				3770
				3771	/* Special case for empty strings */
				3772	if (PyString_GET_SIZE(self) == 0)
				3773	return PyInt_FromLong(0);
				3774
				3775	e = p + PyUnicode_GET_SIZE(self);
				3776	for (; p < e; p++) {
				3777	if (!Py_UNICODE_ISALPHA(*p))
				3778	return PyInt_FromLong(0);
				3779	}
				3780	return PyInt_FromLong(1);
				3781	}
				3782
				3783	static char isalnum__doc__[] =
				3784	"S.isalnum() -> int\n\
				3785	\n\
				3786	Return 1 if all characters in S are alphanumeric\n\
				3787	and there is at least one character in S, 0 otherwise.";
				3788
				3789	static PyObject*
				3790	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3791	{
				3792	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3793	register const Py_UNICODE *e;
				3794
				3795	if (!PyArg_NoArgs(args))
				3796	return NULL;
				3797
				3798	/* Shortcut for single character strings */
				3799	if (PyUnicode_GET_SIZE(self) == 1 &&
				3800	Py_UNICODE_ISALNUM(*p))
				3801	return PyInt_FromLong(1);
				3802
				3803	/* Special case for empty strings */
				3804	if (PyString_GET_SIZE(self) == 0)
				3805	return PyInt_FromLong(0);
				3806
				3807	e = p + PyUnicode_GET_SIZE(self);
				3808	for (; p < e; p++) {
				3809	if (!Py_UNICODE_ISALNUM(*p))
				3810	return PyInt_FromLong(0);
				3811	}
				3812	return PyInt_FromLong(1);
				3813	}
				3814
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3815	static char isdecimal__doc__[] =
				3816	"S.isdecimal() -> int\n\
				3817	\n\
				3818	Return 1 if there are only decimal characters in S,\n\
				3819	0 otherwise.";
				3820
				3821	static PyObject*
				3822	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3823	{
				3824	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3825	register const Py_UNICODE *e;
				3826
				3827	if (!PyArg_NoArgs(args))
				3828	return NULL;
				3829
				3830	/* Shortcut for single character strings */
				3831	if (PyUnicode_GET_SIZE(self) == 1 &&
				3832	Py_UNICODE_ISDECIMAL(*p))
				3833	return PyInt_FromLong(1);
				3834
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3835	/* Special case for empty strings */
				3836	if (PyString_GET_SIZE(self) == 0)
				3837	return PyInt_FromLong(0);
				3838
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3839	e = p + PyUnicode_GET_SIZE(self);
				3840	for (; p < e; p++) {
				3841	if (!Py_UNICODE_ISDECIMAL(*p))
				3842	return PyInt_FromLong(0);
				3843	}
				3844	return PyInt_FromLong(1);
				3845	}
				3846
				3847	static char isdigit__doc__[] =
				3848	"S.isdigit() -> int\n\
				3849	\n\
				3850	Return 1 if there are only digit characters in S,\n\
				3851	0 otherwise.";
				3852
				3853	static PyObject*
				3854	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3855	{
				3856	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3857	register const Py_UNICODE *e;
				3858
				3859	if (!PyArg_NoArgs(args))
				3860	return NULL;
				3861
				3862	/* Shortcut for single character strings */
				3863	if (PyUnicode_GET_SIZE(self) == 1 &&
				3864	Py_UNICODE_ISDIGIT(*p))
				3865	return PyInt_FromLong(1);
				3866
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3867	/* Special case for empty strings */
				3868	if (PyString_GET_SIZE(self) == 0)
				3869	return PyInt_FromLong(0);
				3870
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3871	e = p + PyUnicode_GET_SIZE(self);
				3872	for (; p < e; p++) {
				3873	if (!Py_UNICODE_ISDIGIT(*p))
				3874	return PyInt_FromLong(0);
				3875	}
				3876	return PyInt_FromLong(1);
				3877	}
				3878
				3879	static char isnumeric__doc__[] =
				3880	"S.isnumeric() -> int\n\
				3881	\n\
				3882	Return 1 if there are only numeric characters in S,\n\
				3883	0 otherwise.";
				3884
				3885	static PyObject*
				3886	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3887	{
				3888	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3889	register const Py_UNICODE *e;
				3890
				3891	if (!PyArg_NoArgs(args))
				3892	return NULL;
				3893
				3894	/* Shortcut for single character strings */
				3895	if (PyUnicode_GET_SIZE(self) == 1 &&
				3896	Py_UNICODE_ISNUMERIC(*p))
				3897	return PyInt_FromLong(1);
				3898
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3899	/* Special case for empty strings */
				3900	if (PyString_GET_SIZE(self) == 0)
				3901	return PyInt_FromLong(0);
				3902
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3903	e = p + PyUnicode_GET_SIZE(self);
				3904	for (; p < e; p++) {
				3905	if (!Py_UNICODE_ISNUMERIC(*p))
				3906	return PyInt_FromLong(0);
				3907	}
				3908	return PyInt_FromLong(1);
				3909	}
				3910
				3911	static char join__doc__[] =
				3912	"S.join(sequence) -> unicode\n\
				3913	\n\
				3914	Return a string which is the concatenation of the strings in the\n\
				3915	sequence. The separator between elements is S.";
				3916
				3917	static PyObject*
				3918	unicode_join(PyUnicodeObject self, PyObject args)
				3919	{
				3920	PyObject *data;
				3921	if (!PyArg_ParseTuple(args, "O:join", &data))
				3922	return NULL;
				3923
				3924	return PyUnicode_Join((PyObject *)self, data);
				3925	}
				3926
				3927	static int
				3928	unicode_length(PyUnicodeObject *self)
				3929	{
				3930	return self->length;
				3931	}
				3932
				3933	static char ljust__doc__[] =
				3934	"S.ljust(width) -> unicode\n\
				3935	\n\
				3936	Return S left justified in a Unicode string of length width. Padding is\n\
				3937	done using spaces.";
				3938
				3939	static PyObject *
				3940	unicode_ljust(PyUnicodeObject self, PyObject args)
				3941	{
				3942	int width;
				3943	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3944	return NULL;
				3945
				3946	if (self->length >= width) {
				3947	Py_INCREF(self);
				3948	return (PyObject*) self;
				3949	}
				3950
				3951	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3952	}
				3953
				3954	static char lower__doc__[] =
				3955	"S.lower() -> unicode\n\
				3956	\n\
				3957	Return a copy of the string S converted to lowercase.";
				3958
				3959	static PyObject*
				3960	unicode_lower(PyUnicodeObject self, PyObject args)
				3961	{
				3962	if (!PyArg_NoArgs(args))
				3963	return NULL;
				3964	return fixup(self, fixlower);
				3965	}
				3966
				3967	static char lstrip__doc__[] =
				3968	"S.lstrip() -> unicode\n\
				3969	\n\
				3970	Return a copy of the string S with leading whitespace removed.";
				3971
				3972	static PyObject *
				3973	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3974	{
				3975	if (!PyArg_NoArgs(args))
				3976	return NULL;
				3977	return strip(self, 1, 0);
				3978	}
				3979
				3980	static PyObject*
				3981	unicode_repeat(PyUnicodeObject *str, int len)
				3982	{
				3983	PyUnicodeObject *u;
				3984	Py_UNICODE *p;
				3985
				3986	if (len < 0)
				3987	len = 0;
				3988
				3989	if (len == 1) {
				3990	/* no repeat, return original string */
				3991	Py_INCREF(str);
				3992	return (PyObject*) str;
				3993	}
				3994
				3995	u = _PyUnicode_New(len * str->length);
				3996	if (!u)
				3997	return NULL;
				3998
				3999	p = u->str;
				4000
				4001	while (len-- > 0) {
				4002	Py_UNICODE_COPY(p, str->str, str->length);
				4003	p += str->length;
				4004	}
				4005
				4006	return (PyObject*) u;
				4007	}
				4008
				4009	PyObject PyUnicode_Replace(PyObject obj,
				4010	PyObject *subobj,
				4011	PyObject *replobj,
				4012	int maxcount)
				4013	{
				4014	PyObject *self;
				4015	PyObject *str1;
				4016	PyObject *str2;
				4017	PyObject *result;
				4018
				4019	self = PyUnicode_FromObject(obj);
				4020	if (self == NULL)
				4021	return NULL;
				4022	str1 = PyUnicode_FromObject(subobj);
				4023	if (str1 == NULL) {
				4024	Py_DECREF(self);
				4025	return NULL;
				4026	}
				4027	str2 = PyUnicode_FromObject(replobj);
				4028	if (str2 == NULL) {
				4029	Py_DECREF(self);
				4030	Py_DECREF(str1);
				4031	return NULL;
				4032	}
				4033	result = replace((PyUnicodeObject *)self,
				4034	(PyUnicodeObject *)str1,
				4035	(PyUnicodeObject *)str2,
				4036	maxcount);
				4037	Py_DECREF(self);
				4038	Py_DECREF(str1);
				4039	Py_DECREF(str2);
				4040	return result;
				4041	}
				4042
				4043	static char replace__doc__[] =
				4044	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4045	\n\
				4046	Return a copy of S with all occurrences of substring\n\
				4047	old replaced by new. If the optional argument maxsplit is\n\
				4048	given, only the first maxsplit occurrences are replaced.";
				4049
				4050	static PyObject*
				4051	unicode_replace(PyUnicodeObject self, PyObject args)
				4052	{
				4053	PyUnicodeObject *str1;
				4054	PyUnicodeObject *str2;
				4055	int maxcount = -1;
				4056	PyObject *result;
				4057
				4058	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4059	return NULL;
				4060	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4061	if (str1 == NULL)
				4062	return NULL;
				4063	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4064	if (str2 == NULL)
				4065	return NULL;
				4066
				4067	result = replace(self, str1, str2, maxcount);
				4068
				4069	Py_DECREF(str1);
				4070	Py_DECREF(str2);
				4071	return result;
				4072	}
				4073
				4074	static
				4075	PyObject unicode_repr(PyObject unicode)
				4076	{
				4077	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4078	PyUnicode_GET_SIZE(unicode),
				4079	1);
				4080	}
				4081
				4082	static char rfind__doc__[] =
				4083	"S.rfind(sub [,start [,end]]) -> int\n\
				4084	\n\
				4085	Return the highest index in S where substring sub is found,\n\
				4086	such that sub is contained within s[start,end]. Optional\n\
				4087	arguments start and end are interpreted as in slice notation.\n\
				4088	\n\
				4089	Return -1 on failure.";
				4090
				4091	static PyObject *
				4092	unicode_rfind(PyUnicodeObject self, PyObject args)
				4093	{
				4094	PyUnicodeObject *substring;
				4095	int start = 0;
				4096	int end = INT_MAX;
				4097	PyObject *result;
				4098
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4099	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4100	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4101	return NULL;
				4102	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4103	(PyObject *)substring);
				4104	if (substring == NULL)
				4105	return NULL;
				4106
				4107	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4108
				4109	Py_DECREF(substring);
				4110	return result;
				4111	}
				4112
				4113	static char rindex__doc__[] =
				4114	"S.rindex(sub [,start [,end]]) -> int\n\
				4115	\n\
				4116	Like S.rfind() but raise ValueError when the substring is not found.";
				4117
				4118	static PyObject *
				4119	unicode_rindex(PyUnicodeObject self, PyObject args)
				4120	{
				4121	int result;
				4122	PyUnicodeObject *substring;
				4123	int start = 0;
				4124	int end = INT_MAX;
				4125
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4126	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4127	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4128	return NULL;
				4129	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4130	(PyObject *)substring);
				4131	if (substring == NULL)
				4132	return NULL;
				4133
				4134	result = findstring(self, substring, start, end, -1);
				4135
				4136	Py_DECREF(substring);
				4137	if (result < 0) {
				4138	PyErr_SetString(PyExc_ValueError, "substring not found");
				4139	return NULL;
				4140	}
				4141	return PyInt_FromLong(result);
				4142	}
				4143
				4144	static char rjust__doc__[] =
				4145	"S.rjust(width) -> unicode\n\
				4146	\n\
				4147	Return S right justified in a Unicode string of length width. Padding is\n\
				4148	done using spaces.";
				4149
				4150	static PyObject *
				4151	unicode_rjust(PyUnicodeObject self, PyObject args)
				4152	{
				4153	int width;
				4154	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4155	return NULL;
				4156
				4157	if (self->length >= width) {
				4158	Py_INCREF(self);
				4159	return (PyObject*) self;
				4160	}
				4161
				4162	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4163	}
				4164
				4165	static char rstrip__doc__[] =
				4166	"S.rstrip() -> unicode\n\
				4167	\n\
				4168	Return a copy of the string S with trailing whitespace removed.";
				4169
				4170	static PyObject *
				4171	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4172	{
				4173	if (!PyArg_NoArgs(args))
				4174	return NULL;
				4175	return strip(self, 0, 1);
				4176	}
				4177
				4178	static PyObject*
				4179	unicode_slice(PyUnicodeObject *self, int start, int end)
				4180	{
				4181	/* standard clamping */
				4182	if (start < 0)
				4183	start = 0;
				4184	if (end < 0)
				4185	end = 0;
				4186	if (end > self->length)
				4187	end = self->length;
				4188	if (start == 0 && end == self->length) {
				4189	/* full slice, return original string */
				4190	Py_INCREF(self);
				4191	return (PyObject*) self;
				4192	}
				4193	if (start > end)
				4194	start = end;
				4195	/* copy slice */
				4196	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4197	end - start);
				4198	}
				4199
				4200	PyObject PyUnicode_Split(PyObject s,
				4201	PyObject *sep,
				4202	int maxsplit)
				4203	{
				4204	PyObject *result;
				4205
				4206	s = PyUnicode_FromObject(s);
				4207	if (s == NULL)
				4208	return NULL;
				4209	if (sep != NULL) {
				4210	sep = PyUnicode_FromObject(sep);
				4211	if (sep == NULL) {
				4212	Py_DECREF(s);
				4213	return NULL;
				4214	}
				4215	}
				4216
				4217	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4218
				4219	Py_DECREF(s);
				4220	Py_XDECREF(sep);
				4221	return result;
				4222	}
				4223
				4224	static char split__doc__[] =
				4225	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4226	\n\
				4227	Return a list of the words in S, using sep as the\n\
				4228	delimiter string. If maxsplit is given, at most maxsplit\n\
				4229	splits are done. If sep is not specified, any whitespace string\n\
				4230	is a separator.";
				4231
				4232	static PyObject*
				4233	unicode_split(PyUnicodeObject self, PyObject args)
				4234	{
				4235	PyObject *substring = Py_None;
				4236	int maxcount = -1;
				4237
				4238	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4239	return NULL;
				4240
				4241	if (substring == Py_None)
				4242	return split(self, NULL, maxcount);
				4243	else if (PyUnicode_Check(substring))
				4244	return split(self, (PyUnicodeObject *)substring, maxcount);
				4245	else
				4246	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4247	}
				4248
				4249	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4250	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4251	\n\
				4252	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4253	Line breaks are not included in the resulting list unless keepends\n\
				4254	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4255
				4256	static PyObject*
				4257	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4258	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4259	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4260
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4261	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4262	return NULL;
				4263
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4264	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4265	}
				4266
				4267	static
				4268	PyObject unicode_str(PyUnicodeObject self)
				4269	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4270	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4271	}
				4272
				4273	static char strip__doc__[] =
				4274	"S.strip() -> unicode\n\
				4275	\n\
				4276	Return a copy of S with leading and trailing whitespace removed.";
				4277
				4278	static PyObject *
				4279	unicode_strip(PyUnicodeObject self, PyObject args)
				4280	{
				4281	if (!PyArg_NoArgs(args))
				4282	return NULL;
				4283	return strip(self, 1, 1);
				4284	}
				4285
				4286	static char swapcase__doc__[] =
				4287	"S.swapcase() -> unicode\n\
				4288	\n\
				4289	Return a copy of S with uppercase characters converted to lowercase\n\
				4290	and vice versa.";
				4291
				4292	static PyObject*
				4293	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4294	{
				4295	if (!PyArg_NoArgs(args))
				4296	return NULL;
				4297	return fixup(self, fixswapcase);
				4298	}
				4299
				4300	static char translate__doc__[] =
				4301	"S.translate(table) -> unicode\n\
				4302	\n\
				4303	Return a copy of the string S, where all characters have been mapped\n\
				4304	through the given translation table, which must be a mapping of\n\
				4305	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4306	are left untouched. Characters mapped to None are deleted.";
				4307
				4308	static PyObject*
				4309	unicode_translate(PyUnicodeObject self, PyObject args)
				4310	{
				4311	PyObject *table;
				4312
				4313	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4314	return NULL;
				4315	return PyUnicode_TranslateCharmap(self->str,
				4316	self->length,
				4317	table,
				4318	"ignore");
				4319	}
				4320
				4321	static char upper__doc__[] =
				4322	"S.upper() -> unicode\n\
				4323	\n\
				4324	Return a copy of S converted to uppercase.";
				4325
				4326	static PyObject*
				4327	unicode_upper(PyUnicodeObject self, PyObject args)
				4328	{
				4329	if (!PyArg_NoArgs(args))
				4330	return NULL;
				4331	return fixup(self, fixupper);
				4332	}
				4333
				4334	#if 0
				4335	static char zfill__doc__[] =
				4336	"S.zfill(width) -> unicode\n\
				4337	\n\
				4338	Pad a numeric string x with zeros on the left, to fill a field\n\
				4339	of the specified width. The string x is never truncated.";
				4340
				4341	static PyObject *
				4342	unicode_zfill(PyUnicodeObject self, PyObject args)
				4343	{
				4344	int fill;
				4345	PyUnicodeObject *u;
				4346
				4347	int width;
				4348	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4349	return NULL;
				4350
				4351	if (self->length >= width) {
				4352	Py_INCREF(self);
				4353	return (PyObject*) self;
				4354	}
				4355
				4356	fill = width - self->length;
				4357
				4358	u = pad(self, fill, 0, '0');
				4359
				4360	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4361	/* move sign to beginning of string */
				4362	u->str[0] = u->str[fill];
				4363	u->str[fill] = '0';
				4364	}
				4365
				4366	return (PyObject*) u;
				4367	}
				4368	#endif
				4369
				4370	#if 0
				4371	static PyObject*
				4372	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4373	{
				4374	if (!PyArg_NoArgs(args))
				4375	return NULL;
				4376	return PyInt_FromLong(unicode_freelist_size);
				4377	}
				4378	#endif
				4379
				4380	static char startswith__doc__[] =
				4381	"S.startswith(prefix[, start[, end]]) -> int\n\
				4382	\n\
				4383	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4384	optional start, test S beginning at that position. With optional end, stop\n\
				4385	comparing S at that position.";
				4386
				4387	static PyObject *
				4388	unicode_startswith(PyUnicodeObject *self,
				4389	PyObject *args)
				4390	{
				4391	PyUnicodeObject *substring;
				4392	int start = 0;
				4393	int end = INT_MAX;
				4394	PyObject *result;
				4395
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4396	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4397	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4398	return NULL;
				4399	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4400	(PyObject *)substring);
				4401	if (substring == NULL)
				4402	return NULL;
				4403
				4404	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4405
				4406	Py_DECREF(substring);
				4407	return result;
				4408	}
				4409
				4410
				4411	static char endswith__doc__[] =
				4412	"S.endswith(suffix[, start[, end]]) -> int\n\
				4413	\n\
				4414	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4415	optional start, test S beginning at that position. With optional end, stop\n\
				4416	comparing S at that position.";
				4417
				4418	static PyObject *
				4419	unicode_endswith(PyUnicodeObject *self,
				4420	PyObject *args)
				4421	{
				4422	PyUnicodeObject *substring;
				4423	int start = 0;
				4424	int end = INT_MAX;
				4425	PyObject *result;
				4426
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4427	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4428	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4429	return NULL;
				4430	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4431	(PyObject *)substring);
				4432	if (substring == NULL)
				4433	return NULL;
				4434
				4435	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4436
				4437	Py_DECREF(substring);
				4438	return result;
				4439	}
				4440
				4441
				4442	static PyMethodDef unicode_methods[] = {
				4443
				4444	/* Order is according to common usage: often used methods should
				4445	appear first, since lookup is done sequentially. */
				4446
				4447	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4448	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4449	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4450	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4451	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4452	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4453	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4454	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4455	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4456	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4457	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4458	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4459	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4460	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4461	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4462	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4463	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4464	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4465	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4466	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4467	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4468	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4469	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4470	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4471	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4472	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4473	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4474	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4475	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4476	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4477	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4478	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4479	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4480	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4481	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4482	#if 0
				4483	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4484	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4485	#endif
				4486
				4487	#if 0
				4488	/* This one is just used for debugging the implementation. */
				4489	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4490	#endif
				4491
				4492	{NULL, NULL}
				4493	};
				4494
				4495	static PyObject *
				4496	unicode_getattr(PyUnicodeObject self, char name)
				4497	{
				4498	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4499	}
				4500
				4501	static PySequenceMethods unicode_as_sequence = {
				4502	(inquiry) unicode_length, /* sq_length */
				4503	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4504	(intargfunc) unicode_repeat, /* sq_repeat */
				4505	(intargfunc) unicode_getitem, /* sq_item */
				4506	(intintargfunc) unicode_slice, /* sq_slice */
				4507	0, /* sq_ass_item */
				4508	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4509	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4510	};
				4511
				4512	static int
				4513	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4514	int index,
				4515	const void **ptr)
				4516	{
				4517	if (index != 0) {
				4518	PyErr_SetString(PyExc_SystemError,
				4519	"accessing non-existent unicode segment");
				4520	return -1;
				4521	}
				4522	ptr = (void ) self->str;
				4523	return PyUnicode_GET_DATA_SIZE(self);
				4524	}
				4525
				4526	static int
				4527	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4528	const void **ptr)
				4529	{
				4530	PyErr_SetString(PyExc_TypeError,
				4531	"cannot use unicode as modifyable buffer");
				4532	return -1;
				4533	}
				4534
				4535	static int
				4536	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4537	int *lenp)
				4538	{
				4539	if (lenp)
				4540	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4541	return 1;
				4542	}
				4543
				4544	static int
				4545	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4546	int index,
				4547	const void **ptr)
				4548	{
				4549	PyObject *str;
				4550
				4551	if (index != 0) {
				4552	PyErr_SetString(PyExc_SystemError,
				4553	"accessing non-existent unicode segment");
				4554	return -1;
				4555	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4556	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4557	if (str == NULL)
				4558	return -1;
				4559	ptr = (void ) PyString_AS_STRING(str);
				4560	return PyString_GET_SIZE(str);
				4561	}
				4562
				4563	/* Helpers for PyUnicode_Format() */
				4564
				4565	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4566	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4567	{
				4568	int argidx = *p_argidx;
				4569	if (argidx < arglen) {
				4570	(*p_argidx)++;
				4571	if (arglen < 0)
				4572	return args;
				4573	else
				4574	return PyTuple_GetItem(args, argidx);
				4575	}
				4576	PyErr_SetString(PyExc_TypeError,
				4577	"not enough arguments for format string");
				4578	return NULL;
				4579	}
				4580
				4581	#define F_LJUST (1<<0)
				4582	#define F_SIGN (1<<1)
				4583	#define F_BLANK (1<<2)
				4584	#define F_ALT (1<<3)
				4585	#define F_ZERO (1<<4)
				4586
				4587	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4588	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4589	{
				4590	register int i;
				4591	int len;
				4592	va_list va;
				4593	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4594	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4595
				4596	/* First, format the string as char array, then expand to Py_UNICODE
				4597	array. */
				4598	charbuffer = (char *)buffer;
				4599	len = vsprintf(charbuffer, format, va);
				4600	for (i = len - 1; i >= 0; i--)
				4601	buffer[i] = (Py_UNICODE) charbuffer[i];
				4602
				4603	va_end(va);
				4604	return len;
				4605	}
				4606
				4607	static int
				4608	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4609	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4610	int flags,
				4611	int prec,
				4612	int type,
				4613	PyObject *v)
				4614	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4615	/* fmt = '%#.' + `prec` + `type`
				4616	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4617	char fmt[20];
				4618	double x;
				4619
				4620	x = PyFloat_AsDouble(v);
				4621	if (x == -1.0 && PyErr_Occurred())
				4622	return -1;
				4623	if (prec < 0)
				4624	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4625	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4626	type = 'g';
				4627	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4628	/* worst case length calc to ensure no buffer overrun:
				4629	fmt = %#.<prec>g
				4630	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4631	for any double rep.)
				4632	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4633	If prec=0 the effective precision is 1 (the leading digit is
				4634	always given), therefore increase by one to 10+prec. */
				4635	if (buflen <= (size_t)10 + (size_t)prec) {
				4636	PyErr_SetString(PyExc_OverflowError,
				4637	"formatted float is too long (precision too long?)");
				4638	return -1;
				4639	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4640	return usprintf(buf, fmt, x);
				4641	}
				4642
				4643	static int
				4644	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4645	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4646	int flags,
				4647	int prec,
				4648	int type,
				4649	PyObject *v)
				4650	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4651	/* fmt = '%#.' + `prec` + 'l' + `type`
				4652	worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4653	char fmt[20];
				4654	long x;
				4655
				4656	x = PyInt_AsLong(v);
				4657	if (x == -1 && PyErr_Occurred())
				4658	return -1;
				4659	if (prec < 0)
				4660	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4661	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4662	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4663	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4664	PyErr_SetString(PyExc_OverflowError,
				4665	"formatted integer is too long (precision too long?)");
				4666	return -1;
				4667	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4668	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4669	return usprintf(buf, fmt, x);
				4670	}
				4671
				4672	static int
				4673	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4674	size_t buflen,
				4675	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4676	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4677	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4678	if (PyUnicode_Check(v)) {
				4679	if (PyUnicode_GET_SIZE(v) != 1)
				4680	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4681	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4682	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4683
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4684	else if (PyString_Check(v)) {
				4685	if (PyString_GET_SIZE(v) != 1)
				4686	goto onError;
				4687	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4688	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4689
				4690	else {
				4691	/* Integer input truncated to a character */
				4692	long x;
				4693	x = PyInt_AsLong(v);
				4694	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4695	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4696	buf[0] = (char) x;
				4697	}
				4698	buf[1] = '\0';
				4699	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4700
				4701	onError:
				4702	PyErr_SetString(PyExc_TypeError,
				4703	"%c requires int or char");
				4704	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4705	}
				4706
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is parenthesized so the macro behaves as a single
   operand in any expression.
*/
#define FORMATBUFLEN ((size_t)120)
				4716
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4717	PyObject PyUnicode_Format(PyObject format,
				4718	PyObject *args)
				4719	{
				4720	Py_UNICODE fmt, res;
				4721	int fmtcnt, rescnt, reslen, arglen, argidx;
				4722	int args_owned = 0;
				4723	PyUnicodeObject *result = NULL;
				4724	PyObject *dict = NULL;
				4725	PyObject *uformat;
				4726
				4727	if (format == NULL \|\| args == NULL) {
				4728	PyErr_BadInternalCall();
				4729	return NULL;
				4730	}
				4731	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4732	if (uformat == NULL)
				4733	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4734	fmt = PyUnicode_AS_UNICODE(uformat);
				4735	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4736
				4737	reslen = rescnt = fmtcnt + 100;
				4738	result = _PyUnicode_New(reslen);
				4739	if (result == NULL)
				4740	goto onError;
				4741	res = PyUnicode_AS_UNICODE(result);
				4742
				4743	if (PyTuple_Check(args)) {
				4744	arglen = PyTuple_Size(args);
				4745	argidx = 0;
				4746	}
				4747	else {
				4748	arglen = -1;
				4749	argidx = -2;
				4750	}
				4751	if (args->ob_type->tp_as_mapping)
				4752	dict = args;
				4753
				4754	while (--fmtcnt >= 0) {
				4755	if (*fmt != '%') {
				4756	if (--rescnt < 0) {
				4757	rescnt = fmtcnt + 100;
				4758	reslen += rescnt;
				4759	if (_PyUnicode_Resize(result, reslen) < 0)
				4760	return NULL;
				4761	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4762	--rescnt;
				4763	}
				4764	res++ = fmt++;
				4765	}
				4766	else {
				4767	/* Got a format specifier */
				4768	int flags = 0;
				4769	int width = -1;
				4770	int prec = -1;
				4771	int size = 0;
				4772	Py_UNICODE c = '\0';
				4773	Py_UNICODE fill;
				4774	PyObject *v = NULL;
				4775	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4776	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4777	Py_UNICODE sign;
				4778	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4779	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4780
				4781	fmt++;
				4782	if (*fmt == '(') {
				4783	Py_UNICODE *keystart;
				4784	int keylen;
				4785	PyObject *key;
				4786	int pcount = 1;
				4787
				4788	if (dict == NULL) {
				4789	PyErr_SetString(PyExc_TypeError,
				4790	"format requires a mapping");
				4791	goto onError;
				4792	}
				4793	++fmt;
				4794	--fmtcnt;
				4795	keystart = fmt;
				4796	/* Skip over balanced parentheses */
				4797	while (pcount > 0 && --fmtcnt >= 0) {
				4798	if (*fmt == ')')
				4799	--pcount;
				4800	else if (*fmt == '(')
				4801	++pcount;
				4802	fmt++;
				4803	}
				4804	keylen = fmt - keystart - 1;
				4805	if (fmtcnt < 0 \|\| pcount > 0) {
				4806	PyErr_SetString(PyExc_ValueError,
				4807	"incomplete format key");
				4808	goto onError;
				4809	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4810	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4811	then looked up since Python uses strings to hold
				4812	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4813	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4814	key = PyUnicode_EncodeUTF8(keystart,
				4815	keylen,
				4816	NULL);
				4817	if (key == NULL)
				4818	goto onError;
				4819	if (args_owned) {
				4820	Py_DECREF(args);
				4821	args_owned = 0;
				4822	}
				4823	args = PyObject_GetItem(dict, key);
				4824	Py_DECREF(key);
				4825	if (args == NULL) {
				4826	goto onError;
				4827	}
				4828	args_owned = 1;
				4829	arglen = -1;
				4830	argidx = -2;
				4831	}
				4832	while (--fmtcnt >= 0) {
				4833	switch (c = *fmt++) {
				4834	case '-': flags \|= F_LJUST; continue;
				4835	case '+': flags \|= F_SIGN; continue;
				4836	case ' ': flags \|= F_BLANK; continue;
				4837	case '#': flags \|= F_ALT; continue;
				4838	case '0': flags \|= F_ZERO; continue;
				4839	}
				4840	break;
				4841	}
				4842	if (c == '*') {
				4843	v = getnextarg(args, arglen, &argidx);
				4844	if (v == NULL)
				4845	goto onError;
				4846	if (!PyInt_Check(v)) {
				4847	PyErr_SetString(PyExc_TypeError,
				4848	"* wants int");
				4849	goto onError;
				4850	}
				4851	width = PyInt_AsLong(v);
				4852	if (width < 0) {
				4853	flags \|= F_LJUST;
				4854	width = -width;
				4855	}
				4856	if (--fmtcnt >= 0)
				4857	c = *fmt++;
				4858	}
				4859	else if (c >= '0' && c <= '9') {
				4860	width = c - '0';
				4861	while (--fmtcnt >= 0) {
				4862	c = *fmt++;
				4863	if (c < '0' \|\| c > '9')
				4864	break;
				4865	if ((width*10) / 10 != width) {
				4866	PyErr_SetString(PyExc_ValueError,
				4867	"width too big");
				4868	goto onError;
				4869	}
				4870	width = width*10 + (c - '0');
				4871	}
				4872	}
				4873	if (c == '.') {
				4874	prec = 0;
				4875	if (--fmtcnt >= 0)
				4876	c = *fmt++;
				4877	if (c == '*') {
				4878	v = getnextarg(args, arglen, &argidx);
				4879	if (v == NULL)
				4880	goto onError;
				4881	if (!PyInt_Check(v)) {
				4882	PyErr_SetString(PyExc_TypeError,
				4883	"* wants int");
				4884	goto onError;
				4885	}
				4886	prec = PyInt_AsLong(v);
				4887	if (prec < 0)
				4888	prec = 0;
				4889	if (--fmtcnt >= 0)
				4890	c = *fmt++;
				4891	}
				4892	else if (c >= '0' && c <= '9') {
				4893	prec = c - '0';
				4894	while (--fmtcnt >= 0) {
				4895	c = Py_CHARMASK(*fmt++);
				4896	if (c < '0' \|\| c > '9')
				4897	break;
				4898	if ((prec*10) / 10 != prec) {
				4899	PyErr_SetString(PyExc_ValueError,
				4900	"prec too big");
				4901	goto onError;
				4902	}
				4903	prec = prec*10 + (c - '0');
				4904	}
				4905	}
				4906	} /* prec */
				4907	if (fmtcnt >= 0) {
				4908	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4909	size = c;
				4910	if (--fmtcnt >= 0)
				4911	c = *fmt++;
				4912	}
				4913	}
				4914	if (fmtcnt < 0) {
				4915	PyErr_SetString(PyExc_ValueError,
				4916	"incomplete format");
				4917	goto onError;
				4918	}
				4919	if (c != '%') {
				4920	v = getnextarg(args, arglen, &argidx);
				4921	if (v == NULL)
				4922	goto onError;
				4923	}
				4924	sign = 0;
				4925	fill = ' ';
				4926	switch (c) {
				4927
				4928	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4929	pbuf = formatbuf;
				4930	/* presume that buffer length is at least 1 */
				4931	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4932	len = 1;
				4933	break;
				4934
				4935	case 's':
				4936	case 'r':
				4937	if (PyUnicode_Check(v) && c == 's') {
				4938	temp = v;
				4939	Py_INCREF(temp);
				4940	}
				4941	else {
				4942	PyObject *unicode;
				4943	if (c == 's')
				4944	temp = PyObject_Str(v);
				4945	else
				4946	temp = PyObject_Repr(v);
				4947	if (temp == NULL)
				4948	goto onError;
				4949	if (!PyString_Check(temp)) {
				4950	/* XXX Note: this should never happen, since
				4951	PyObject_Repr() and PyObject_Str() assure
				4952	this */
				4953	Py_DECREF(temp);
				4954	PyErr_SetString(PyExc_TypeError,
				4955	"%s argument has non-string str()");
				4956	goto onError;
				4957	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4958	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4959	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4960	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4961	"strict");
				4962	Py_DECREF(temp);
				4963	temp = unicode;
				4964	if (temp == NULL)
				4965	goto onError;
				4966	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4967	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4968	len = PyUnicode_GET_SIZE(temp);
				4969	if (prec >= 0 && len > prec)
				4970	len = prec;
				4971	break;
				4972
				4973	case 'i':
				4974	case 'd':
				4975	case 'u':
				4976	case 'o':
				4977	case 'x':
				4978	case 'X':
				4979	if (c == 'i')
				4980	c = 'd';
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4981	pbuf = formatbuf;
				4982	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4983	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4984	if (len < 0)
				4985	goto onError;
				4986	sign = (c == 'd');
				4987	if (flags & F_ZERO) {
				4988	fill = '0';
				4989	if ((flags&F_ALT) &&
				4990	(c == 'x' \|\| c == 'X') &&
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4991	pbuf[0] == '0' && pbuf[1] == c) {
				4992	res++ = pbuf++;
				4993	res++ = pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4994	rescnt -= 2;
				4995	len -= 2;
				4996	width -= 2;
				4997	if (width < 0)
				4998	width = 0;
				4999	}
				5000	}
				5001	break;
				5002
				5003	case 'e':
				5004	case 'E':
				5005	case 'f':
				5006	case 'g':
				5007	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5008	pbuf = formatbuf;
				5009	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5010	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5011	if (len < 0)
				5012	goto onError;
				5013	sign = 1;
				5014	if (flags&F_ZERO)
				5015	fill = '0';
				5016	break;
				5017
				5018	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5019	pbuf = formatbuf;
				5020	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5021	if (len < 0)
				5022	goto onError;
				5023	break;
				5024
				5025	default:
				5026	PyErr_Format(PyExc_ValueError,
				5027	"unsupported format character '%c' (0x%x)",
				5028	c, c);
				5029	goto onError;
				5030	}
				5031	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5032	if (pbuf == '-' \|\| pbuf == '+') {
				5033	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5034	len--;
				5035	}
				5036	else if (flags & F_SIGN)
				5037	sign = '+';
				5038	else if (flags & F_BLANK)
				5039	sign = ' ';
				5040	else
				5041	sign = 0;
				5042	}
				5043	if (width < len)
				5044	width = len;
				5045	if (rescnt < width + (sign != 0)) {
				5046	reslen -= rescnt;
				5047	rescnt = width + fmtcnt + 100;
				5048	reslen += rescnt;
				5049	if (_PyUnicode_Resize(result, reslen) < 0)
				5050	return NULL;
				5051	res = PyUnicode_AS_UNICODE(result)
				5052	+ reslen - rescnt;
				5053	}
				5054	if (sign) {
				5055	if (fill != ' ')
				5056	*res++ = sign;
				5057	rescnt--;
				5058	if (width > len)
				5059	width--;
				5060	}
				5061	if (width > len && !(flags & F_LJUST)) {
				5062	do {
				5063	--rescnt;
				5064	*res++ = fill;
				5065	} while (--width > len);
				5066	}
				5067	if (sign && fill == ' ')
				5068	*res++ = sign;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5069	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5070	res += len;
				5071	rescnt -= len;
				5072	while (--width >= len) {
				5073	--rescnt;
				5074	*res++ = ' ';
				5075	}
				5076	if (dict && (argidx < arglen) && c != '%') {
				5077	PyErr_SetString(PyExc_TypeError,
				5078	"not all arguments converted");
				5079	goto onError;
				5080	}
				5081	Py_XDECREF(temp);
				5082	} /* '%' */
				5083	} /* until end */
				5084	if (argidx < arglen && !dict) {
				5085	PyErr_SetString(PyExc_TypeError,
				5086	"not all arguments converted");
				5087	goto onError;
				5088	}
				5089
				5090	if (args_owned) {
				5091	Py_DECREF(args);
				5092	}
				5093	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5094	if (_PyUnicode_Resize(result, reslen - rescnt))
				5095	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5096	return (PyObject *)result;
				5097
				5098	onError:
				5099	Py_XDECREF(result);
				5100	Py_DECREF(uformat);
				5101	if (args_owned) {
				5102	Py_DECREF(args);
				5103	}
				5104	return NULL;
				5105	}
				5106
				5107	static PyBufferProcs unicode_as_buffer = {
				5108	(getreadbufferproc) unicode_buffer_getreadbuf,
				5109	(getwritebufferproc) unicode_buffer_getwritebuf,
				5110	(getsegcountproc) unicode_buffer_getsegcount,
				5111	(getcharbufferproc) unicode_buffer_getcharbuf,
				5112	};
				5113
				5114	PyTypeObject PyUnicode_Type = {
				5115	PyObject_HEAD_INIT(&PyType_Type)
				5116	0, /* ob_size */
				5117	"unicode", /* tp_name */
				5118	sizeof(PyUnicodeObject), /* tp_size */
				5119	0, /* tp_itemsize */
				5120	/* Slots */
				5121	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5122	0, /* tp_print */
				5123	(getattrfunc)unicode_getattr, /* tp_getattr */
				5124	0, /* tp_setattr */
				5125	(cmpfunc) unicode_compare, /* tp_compare */
				5126	(reprfunc) unicode_repr, /* tp_repr */
				5127	0, /* tp_as_number */
				5128	&unicode_as_sequence, /* tp_as_sequence */
				5129	0, /* tp_as_mapping */
				5130	(hashfunc) unicode_hash, /* tp_hash*/
				5131	0, /* tp_call*/
				5132	(reprfunc) unicode_str, /* tp_str */
				5133	(getattrofunc) NULL, /* tp_getattro */
				5134	(setattrofunc) NULL, /* tp_setattro */
				5135	&unicode_as_buffer, /* tp_as_buffer */
				5136	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5137	};
				5138
				5139	/* Initialize the Unicode implementation */
				5140
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5141	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5142	{
				5143	/* Doublecheck the configuration... */
				5144	if (sizeof(Py_UNICODE) != 2)
				5145	Py_FatalError("Unicode configuration error: "
				5146	"sizeof(Py_UNICODE) != 2 bytes");
				5147
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5148	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5149	unicode_freelist = NULL;
				5150	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5151	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5152	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5153	}
				5154
				5155	/* Finalize the Unicode implementation */
				5156
				5157	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5158	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5159	{
				5160	PyUnicodeObject *u = unicode_freelist;
				5161
				5162	while (u != NULL) {
				5163	PyUnicodeObject *v = u;
				5164	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5165	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5166	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5167	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5168	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5169	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5170	unicode_freelist = NULL;
				5171	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5172	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5173	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5174	}