Blame - Objects/unicodeobject.c - platform/external/python/cpython2

blob: b3c8ba4790f88041a762b8aa969332f905c51703 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	48	/* Limit for the Unicode object free list */
				49
				50	#define MAX_UNICODE_FREELIST_SIZE 1024
				51
				52	/* Limit for the Unicode object free list stay alive optimization.
				53
				54	The implementation will keep allocated Unicode memory intact for
				55	all objects on the free list having a size less than this
				56	limit. This reduces malloc() overhead for small Unicode objects.
				57
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	58	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	59	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	60	malloc()-overhead) bytes of unused garbage.
				61
				62	Setting the limit to 0 effectively turns the feature off.
				63
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	64	Note: This is an experimental feature ! If you get core dumps when
				65	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	66
				67	*/
				68
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	69	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	/* Endianness switches; defaults to little endian */
				72
				73	#ifdef WORDS_BIGENDIAN
				74	# define BYTEORDER_IS_BIG_ENDIAN
				75	#else
				76	# define BYTEORDER_IS_LITTLE_ENDIAN
				77	#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
				86	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	88
				89	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	90	static PyUnicodeObject *unicode_freelist;
				91	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	93	/* Default encoding to use and assume when NULL is passed as encoding
				94	parameter; it is initialized by _PyUnicode_Init().
				95
				96	Always use the PyUnicode_SetDefaultEncoding() and
				97	PyUnicode_GetDefaultEncoding() APIs to access this global.
				98
				99	*/
				100
				101	static char unicode_default_encoding[100];
				102
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	103	/* --- Unicode Object ----------------------------------------------------- */
				104
				105	static
				106	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				107	int length)
				108	{
				109	void *oldstr;
				110
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	111	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	112	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	113	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Resizing unicode_empty is not allowed. */
				116	if (unicode == unicode_empty) {
				117	PyErr_SetString(PyExc_SystemError,
				118	"can't resize empty unicode object");
				119	return -1;
				120	}
				121
				122	/* We allocate one more byte to make sure the string is
				123	Ux0000 terminated -- XXX is this needed ? */
				124	oldstr = unicode->str;
				125	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				126	if (!unicode->str) {
				127	unicode->str = oldstr;
				128	PyErr_NoMemory();
				129	return -1;
				130	}
				131	unicode->str[length] = 0;
				132	unicode->length = length;
				133
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	134	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	136	if (unicode->defenc) {
				137	Py_DECREF(unicode->defenc);
				138	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	}
				140	unicode->hash = -1;
				141
				142	return 0;
				143	}
				144
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	int PyUnicode_Resize(PyObject **unicode,
				146	int length)
				147	{
				148	PyUnicodeObject *v;
				149
				150	if (unicode == NULL) {
				151	PyErr_BadInternalCall();
				152	return -1;
				153	}
				154	v = (PyUnicodeObject )unicode;
				155	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				156	PyErr_BadInternalCall();
				157	return -1;
				158	}
				159	return _PyUnicode_Resize(v, length);
				160	}
				161
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	162	/* We allocate one more byte to make sure the string is
				163	Ux0000 terminated -- XXX is this needed ?
				164
				165	XXX This allocator could further be enhanced by assuring that the
				166	free list never reduces its size below 1.
				167
				168	*/
				169
				170	static
				171	PyUnicodeObject *_PyUnicode_New(int length)
				172	{
				173	register PyUnicodeObject *unicode;
				174
				175	/* Optimization for empty strings */
				176	if (length == 0 && unicode_empty != NULL) {
				177	Py_INCREF(unicode_empty);
				178	return unicode_empty;
				179	}
				180
				181	/* Unicode freelist & memory allocation */
				182	if (unicode_freelist) {
				183	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	184	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	185	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	186	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	187	/* Keep-Alive optimization: we only upsize the buffer,
				188	never downsize it. */
				189	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	190	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	191	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	192	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	193	}
				194	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	195	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	197	}
				198	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	}
				200	else {
				201	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				202	if (unicode == NULL)
				203	return NULL;
				204	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				205	}
				206
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	207	if (!unicode->str) {
				208	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	209	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode->str[length] = 0;
				212	unicode->length = length;
				213	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	214	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	215	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	216
				217	onError:
				218	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	219	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	220	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	221	}
				222
				223	static
				224	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				225	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	227	/* Keep-Alive optimization */
				228	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	229	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	230	unicode->str = NULL;
				231	unicode->length = 0;
				232	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	233	if (unicode->defenc) {
				234	Py_DECREF(unicode->defenc);
				235	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	236	}
				237	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	238	(PyUnicodeObject *)unicode = unicode_freelist;
				239	unicode_freelist = unicode;
				240	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	}
				242	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	243	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	244	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	246	}
				247	}
				248
				249	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				250	int size)
				251	{
				252	PyUnicodeObject *unicode;
				253
				254	unicode = _PyUnicode_New(size);
				255	if (!unicode)
				256	return NULL;
				257
				258	/* Copy the Unicode data into the new object */
				259	if (u != NULL)
				260	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				261
				262	return (PyObject *)unicode;
				263	}
				264
				265	#ifdef HAVE_WCHAR_H
				266
				267	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				268	int size)
				269	{
				270	PyUnicodeObject *unicode;
				271
				272	if (w == NULL) {
				273	PyErr_BadInternalCall();
				274	return NULL;
				275	}
				276
				277	unicode = _PyUnicode_New(size);
				278	if (!unicode)
				279	return NULL;
				280
				281	/* Copy the wchar_t data into the new object */
				282	#ifdef HAVE_USABLE_WCHAR_T
				283	memcpy(unicode->str, w, size * sizeof(wchar_t));
				284	#else
				285	{
				286	register Py_UNICODE *u;
				287	register int i;
				288	u = PyUnicode_AS_UNICODE(unicode);
				289	for (i = size; i >= 0; i--)
				290	u++ = w++;
				291	}
				292	#endif
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				298	register wchar_t *w,
				299	int size)
				300	{
				301	if (unicode == NULL) {
				302	PyErr_BadInternalCall();
				303	return -1;
				304	}
				305	if (size > PyUnicode_GET_SIZE(unicode))
				306	size = PyUnicode_GET_SIZE(unicode);
				307	#ifdef HAVE_USABLE_WCHAR_T
				308	memcpy(w, unicode->str, size * sizeof(wchar_t));
				309	#else
				310	{
				311	register Py_UNICODE *u;
				312	register int i;
				313	u = PyUnicode_AS_UNICODE(unicode);
				314	for (i = size; i >= 0; i--)
				315	w++ = u++;
				316	}
				317	#endif
				318
				319	return size;
				320	}
				321
				322	#endif
				323
				324	PyObject PyUnicode_FromObject(register PyObject obj)
				325	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	326	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				327	}
				328
				329	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				330	const char *encoding,
				331	const char *errors)
				332	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	333	const char *s;
				334	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	335	int owned = 0;
				336	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	337
				338	if (obj == NULL) {
				339	PyErr_BadInternalCall();
				340	return NULL;
				341	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	342
				343	/* Coerce object */
				344	if (PyInstance_Check(obj)) {
				345	PyObject *func;
				346	func = PyObject_GetAttrString(obj, "__str__");
				347	if (func == NULL) {
				348	PyErr_SetString(PyExc_TypeError,
				349	"coercing to Unicode: instance doesn't define __str__");
				350	return NULL;
				351	}
				352	obj = PyEval_CallObject(func, NULL);
				353	Py_DECREF(func);
				354	if (obj == NULL)
				355	return NULL;
				356	owned = 1;
				357	}
				358	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	360	v = obj;
				361	if (encoding) {
				362	PyErr_SetString(PyExc_TypeError,
				363	"decoding Unicode is not supported");
				364	return NULL;
				365	}
				366	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	367	}
				368	else if (PyString_Check(obj)) {
				369	s = PyString_AS_STRING(obj);
				370	len = PyString_GET_SIZE(obj);
				371	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	372	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				373	/* Overwrite the error message with something more useful in
				374	case of a TypeError. */
				375	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	376	PyErr_Format(PyExc_TypeError,
				377	"coercing to Unicode: need string or buffer, "
				378	"%.80s found",
				379	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	380	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	381	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	382
				383	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	384	if (len == 0) {
				385	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	387	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	388	else
				389	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	390
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	391	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	392	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	393	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	394	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	395	return v;
				396
				397	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	398	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	399	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	400	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	401	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	402	}
				403
				404	PyObject PyUnicode_Decode(const char s,
				405	int size,
				406	const char *encoding,
				407	const char *errors)
				408	{
				409	PyObject buffer = NULL, unicode;
				410
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	411	if (encoding == NULL)
				412	encoding = PyUnicode_GetDefaultEncoding();
				413
				414	/* Shortcuts for common default encodings */
				415	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	417	else if (strcmp(encoding, "latin-1") == 0)
				418	return PyUnicode_DecodeLatin1(s, size, errors);
				419	else if (strcmp(encoding, "ascii") == 0)
				420	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	421
				422	/* Decode via the codec registry */
				423	buffer = PyBuffer_FromMemory((void *)s, size);
				424	if (buffer == NULL)
				425	goto onError;
				426	unicode = PyCodec_Decode(buffer, encoding, errors);
				427	if (unicode == NULL)
				428	goto onError;
				429	if (!PyUnicode_Check(unicode)) {
				430	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	431	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	432	unicode->ob_type->tp_name);
				433	Py_DECREF(unicode);
				434	goto onError;
				435	}
				436	Py_DECREF(buffer);
				437	return unicode;
				438
				439	onError:
				440	Py_XDECREF(buffer);
				441	return NULL;
				442	}
				443
				444	PyObject PyUnicode_Encode(const Py_UNICODE s,
				445	int size,
				446	const char *encoding,
				447	const char *errors)
				448	{
				449	PyObject v, unicode;
				450
				451	unicode = PyUnicode_FromUnicode(s, size);
				452	if (unicode == NULL)
				453	return NULL;
				454	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				455	Py_DECREF(unicode);
				456	return v;
				457	}
				458
				459	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				460	const char *encoding,
				461	const char *errors)
				462	{
				463	PyObject *v;
				464
				465	if (!PyUnicode_Check(unicode)) {
				466	PyErr_BadArgument();
				467	goto onError;
				468	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	469
				470	if (encoding == NULL)
				471	encoding = PyUnicode_GetDefaultEncoding();
				472
				473	/* Shortcuts for common default encodings */
				474	if (errors == NULL) {
				475	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	476	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	477	else if (strcmp(encoding, "latin-1") == 0)
				478	return PyUnicode_AsLatin1String(unicode);
				479	else if (strcmp(encoding, "ascii") == 0)
				480	return PyUnicode_AsASCIIString(unicode);
				481	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	482
				483	/* Encode via the codec registry */
				484	v = PyCodec_Encode(unicode, encoding, errors);
				485	if (v == NULL)
				486	goto onError;
				487	/* XXX Should we really enforce this ? */
				488	if (!PyString_Check(v)) {
				489	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	490	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	491	v->ob_type->tp_name);
				492	Py_DECREF(v);
				493	goto onError;
				494	}
				495	return v;
				496
				497	onError:
				498	return NULL;
				499	}
				500
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	501	/* Return a Python string holding the default encoded value of the
				502	Unicode object.
				503
				504	The resulting string is cached in the Unicode object for subsequent
				505	usage by this function. The cached version is needed to implement
				506	the character buffer interface and will live (at least) as long as
				507	the Unicode object itself.
				508
				509	The refcount of the string is not incremented.
				510
				511	* Exported for internal use by the interpreter only !!! *
				512
				513	*/
				514
				515	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				516	const char *errors)
				517	{
				518	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				519
				520	if (v)
				521	return v;
				522	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				523	if (v && errors == NULL)
				524	((PyUnicodeObject *)unicode)->defenc = v;
				525	return v;
				526	}
				527
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	528	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				529	{
				530	if (!PyUnicode_Check(unicode)) {
				531	PyErr_BadArgument();
				532	goto onError;
				533	}
				534	return PyUnicode_AS_UNICODE(unicode);
				535
				536	onError:
				537	return NULL;
				538	}
				539
				540	int PyUnicode_GetSize(PyObject *unicode)
				541	{
				542	if (!PyUnicode_Check(unicode)) {
				543	PyErr_BadArgument();
				544	goto onError;
				545	}
				546	return PyUnicode_GET_SIZE(unicode);
				547
				548	onError:
				549	return -1;
				550	}
				551
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	552	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	553	{
				554	return unicode_default_encoding;
				555	}
				556
				557	int PyUnicode_SetDefaultEncoding(const char *encoding)
				558	{
				559	PyObject *v;
				560
				561	/* Make sure the encoding is valid. As side effect, this also
				562	loads the encoding into the codec registry cache. */
				563	v = _PyCodec_Lookup(encoding);
				564	if (v == NULL)
				565	goto onError;
				566	Py_DECREF(v);
				567	strncpy(unicode_default_encoding,
				568	encoding,
				569	sizeof(unicode_default_encoding));
				570	return 0;
				571
				572	onError:
				573	return -1;
				574	}
				575
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	576	/* --- UTF-8 Codec -------------------------------------------------------- */
				577
				578	static
				579	char utf8_code_length[256] = {
				580	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				581	illegal prefix. see RFC 2279 for details */
				582	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				583	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				584	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				585	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				586	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				587	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				588	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				589	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				590	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				591	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				592	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				593	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				594	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				595	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				596	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				597	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				598	};
				599
				600	static
				601	int utf8_decoding_error(const char **source,
				602	Py_UNICODE **dest,
				603	const char *errors,
				604	const char *details)
				605	{
				606	if ((errors == NULL) \|\|
				607	(strcmp(errors,"strict") == 0)) {
				608	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	609	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	610	details);
				611	return -1;
				612	}
				613	else if (strcmp(errors,"ignore") == 0) {
				614	(*source)++;
				615	return 0;
				616	}
				617	else if (strcmp(errors,"replace") == 0) {
				618	(*source)++;
				619	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				620	(*dest)++;
				621	return 0;
				622	}
				623	else {
				624	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	625	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	626	errors);
				627	return -1;
				628	}
				629	}
				630
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	631	PyObject PyUnicode_DecodeUTF8(const char s,
				632	int size,
				633	const char *errors)
				634	{
				635	int n;
				636	const char *e;
				637	PyUnicodeObject *unicode;
				638	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	639	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	640
				641	/* Note: size will always be longer than the resulting Unicode
				642	character count */
				643	unicode = _PyUnicode_New(size);
				644	if (!unicode)
				645	return NULL;
				646	if (size == 0)
				647	return (PyObject *)unicode;
				648
				649	/* Unpack UTF-8 encoded data */
				650	p = unicode->str;
				651	e = s + size;
				652
				653	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	654	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	655
				656	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	657	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	658	s++;
				659	continue;
				660	}
				661
				662	n = utf8_code_length[ch];
				663
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	if (s + n > e) {
				665	errmsg = "unexpected end of data";
				666	goto utf8Error;
				667	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	668
				669	switch (n) {
				670
				671	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	672	errmsg = "unexpected code byte";
				673	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	674
				675	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	676	errmsg = "internal error";
				677	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	678
				679	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	680	if ((s[1] & 0xc0) != 0x80) {
				681	errmsg = "invalid data";
				682	goto utf8Error;
				683	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	684	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	685	if (ch < 0x80) {
				686	errmsg = "illegal encoding";
				687	goto utf8Error;
				688	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	689	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	690	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	691	break;
				692
				693	case 3:
				694	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	695	(s[2] & 0xc0) != 0x80) {
				696	errmsg = "invalid data";
				697	goto utf8Error;
				698	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	700	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				701	errmsg = "illegal encoding";
				702	goto utf8Error;
				703	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	704	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	705	*p++ = (Py_UNICODE)ch;
				706	break;
				707
				708	case 4:
				709	if ((s[1] & 0xc0) != 0x80 \|\|
				710	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	711	(s[3] & 0xc0) != 0x80) {
				712	errmsg = "invalid data";
				713	goto utf8Error;
				714	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	715	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				716	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				717	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	718	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				719	byte encoding */
				720	(ch > 0x10ffff)) { /* maximum value allowed for
				721	UTF-16 */
				722	errmsg = "illegal encoding";
				723	goto utf8Error;
				724	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	725	/* compute and append the two surrogates: */
				726
				727	/* translate from 10000..10FFFF to 0..FFFF */
				728	ch -= 0x10000;
				729
				730	/* high surrogate = top 10 bits added to D800 */
				731	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				732
				733	/* low surrogate = bottom 10 bits added to DC00 */
				734	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	735	break;
				736
				737	default:
				738	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	739	errmsg = "unsupported Unicode code range";
				740	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	741	}
				742	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	743	continue;
				744
				745	utf8Error:
				746	if (utf8_decoding_error(&s, &p, errors, errmsg))
				747	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	748	}
				749
				750	/* Adjust length */
				751	if (_PyUnicode_Resize(unicode, p - unicode->str))
				752	goto onError;
				753
				754	return (PyObject *)unicode;
				755
				756	onError:
				757	Py_DECREF(unicode);
				758	return NULL;
				759	}
				760
/* Legacy UTF-8 encoding error handler.  It is compiled out because the
   encoder supports UTF-16 surrogates and no longer needs it. */
#if 0
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    /* A missing policy is treated like "strict": raise UnicodeError. */
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    /* "ignore": emit nothing and let the caller carry on. */
    if (!strcmp(errors, "ignore"))
        return 0;

    /* "replace": emit a '?' and advance the output cursor. */
    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    /* Anything else is an unknown policy name. */
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	794
				795	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				796	int size,
				797	const char *errors)
				798	{
				799	PyObject *v;
				800	char *p;
				801	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	802	Py_UCS4 ch2;
				803	unsigned int cbAllocated = 3 * size;
				804	unsigned int cbWritten = 0;
				805	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	806
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	807	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	808	if (v == NULL)
				809	return NULL;
				810	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	811	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	812
				813	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	814	while (i < size) {
				815	Py_UCS4 ch = s[i++];
				816	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	817	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	818	cbWritten++;
				819	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	820	else if (ch < 0x0800) {
				821	*p++ = 0xc0 \| (ch >> 6);
				822	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	823	cbWritten += 2;
				824	}
				825	else {
				826	/* Check for high surrogate */
				827	if (0xD800 <= ch && ch <= 0xDBFF) {
				828	if (i != size) {
				829	ch2 = s[i];
				830	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				831
				832	if (cbWritten >= (cbAllocated - 4)) {
				833	/* Provide enough room for some more
				834	surrogates */
				835	cbAllocated += 4*10;
				836	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	837	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	838	}
				839
				840	/* combine the two values */
				841	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				842
				843	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	844	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	845	i++;
				846	cbWritten += 4;
				847	}
				848	}
				849	}
				850	else {
				851	*p++ = (char)(0xe0 \| (ch >> 12));
				852	cbWritten += 3;
				853	}
				854	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				855	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	856	}
				857	}
				858	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	859	if (_PyString_Resize(&v, p - q))
				860	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	861	return v;
				862
				863	onError:
				864	Py_DECREF(v);
				865	return NULL;
				866	}
				867
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	868	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				869	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	870	if (!PyUnicode_Check(unicode)) {
				871	PyErr_BadArgument();
				872	return NULL;
				873	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	874	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				875	PyUnicode_GET_SIZE(unicode),
				876	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	877	}
				878
				879	/* --- UTF-16 Codec ------------------------------------------------------- */
				880
				881	static
				882	int utf16_decoding_error(const Py_UNICODE **source,
				883	Py_UNICODE **dest,
				884	const char *errors,
				885	const char *details)
				886	{
				887	if ((errors == NULL) \|\|
				888	(strcmp(errors,"strict") == 0)) {
				889	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	890	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	891	details);
				892	return -1;
				893	}
				894	else if (strcmp(errors,"ignore") == 0) {
				895	return 0;
				896	}
				897	else if (strcmp(errors,"replace") == 0) {
				898	if (dest) {
				899	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				900	(*dest)++;
				901	}
				902	return 0;
				903	}
				904	else {
				905	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	906	"UTF-16 decoding error; "
				907	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	908	errors);
				909	return -1;
				910	}
				911	}
				912
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	913	PyObject PyUnicode_DecodeUTF16(const char s,
				914	int size,
				915	const char *errors,
				916	int *byteorder)
				917	{
				918	PyUnicodeObject *unicode;
				919	Py_UNICODE *p;
				920	const Py_UNICODE q, e;
				921	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	922	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	923
				924	/* size should be an even number */
				925	if (size % sizeof(Py_UNICODE) != 0) {
				926	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				927	return NULL;
				928	/* The remaining input chars are ignored if we fall through
				929	here... */
				930	}
				931
				932	/* Note: size will always be longer than the resulting Unicode
				933	character count */
				934	unicode = _PyUnicode_New(size);
				935	if (!unicode)
				936	return NULL;
				937	if (size == 0)
				938	return (PyObject *)unicode;
				939
				940	/* Unpack UTF-16 encoded data */
				941	p = unicode->str;
				942	q = (Py_UNICODE *)s;
				943	e = q + (size / sizeof(Py_UNICODE));
				944
				945	if (byteorder)
				946	bo = *byteorder;
				947
				948	while (q < e) {
				949	register Py_UNICODE ch = *q++;
				950
				951	/* Check for BOM marks (U+FEFF) in the input and adjust
				952	current byte order setting accordingly. Swap input
				953	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				954	!) */
				955	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				956	if (ch == 0xFEFF) {
				957	bo = -1;
				958	continue;
				959	} else if (ch == 0xFFFE) {
				960	bo = 1;
				961	continue;
				962	}
				963	if (bo == 1)
				964	ch = (ch >> 8) \| (ch << 8);
				965	#else
				966	if (ch == 0xFEFF) {
				967	bo = 1;
				968	continue;
				969	} else if (ch == 0xFFFE) {
				970	bo = -1;
				971	continue;
				972	}
				973	if (bo == -1)
				974	ch = (ch >> 8) \| (ch << 8);
				975	#endif
				976	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				977	*p++ = ch;
				978	continue;
				979	}
				980
				981	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	982	if (q >= e) {
				983	errmsg = "unexpected end of data";
				984	goto utf16Error;
				985	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	986	if (0xDC00 <= q && q <= 0xDFFF) {
				987	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	988	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	989	/* This is valid data (a UTF-16 surrogate pair), but
				990	we are not able to store this information since our
				991	Py_UNICODE type only has 16 bits... this might
				992	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	993	errmsg = "code pairs are not supported";
				994	goto utf16Error;
				995	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	996	else
				997	continue;
				998	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	999	errmsg = "illegal encoding";
				1000	/* Fall through to report the error */
				1001
				1002	utf16Error:
				1003	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1004	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1005	}
				1006
				1007	if (byteorder)
				1008	*byteorder = bo;
				1009
				1010	/* Adjust length */
				1011	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1012	goto onError;
				1013
				1014	return (PyObject *)unicode;
				1015
				1016	onError:
				1017	Py_DECREF(unicode);
				1018	return NULL;
				1019	}
				1020
				1021	#undef UTF16_ERROR
				1022
				1023	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1024	int size,
				1025	const char *errors,
				1026	int byteorder)
				1027	{
				1028	PyObject *v;
				1029	Py_UNICODE *p;
				1030	char *q;
				1031
				1032	/* We don't create UTF-16 pairs... */
				1033	v = PyString_FromStringAndSize(NULL,
				1034	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1035	if (v == NULL)
				1036	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1037
				1038	q = PyString_AS_STRING(v);
				1039	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1040	if (byteorder == 0)
				1041	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1042	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1043	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1044	if (byteorder == 0 \|\|
				1045	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1046	byteorder == -1
				1047	#else
				1048	byteorder == 1
				1049	#endif
				1050	)
				1051	memcpy(p, s, size * sizeof(Py_UNICODE));
				1052	else
				1053	while (size-- > 0) {
				1054	Py_UNICODE ch = *s++;
				1055	*p++ = (ch >> 8) \| (ch << 8);
				1056	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057	return v;
				1058	}
				1059
				1060	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1061	{
				1062	if (!PyUnicode_Check(unicode)) {
				1063	PyErr_BadArgument();
				1064	return NULL;
				1065	}
				1066	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1067	PyUnicode_GET_SIZE(unicode),
				1068	NULL,
				1069	0);
				1070	}
				1071
				1072	/* --- Unicode Escape Codec ----------------------------------------------- */
				1073
				1074	static
				1075	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1076	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1077	const char *errors,
				1078	const char *details)
				1079	{
				1080	if ((errors == NULL) \|\|
				1081	(strcmp(errors,"strict") == 0)) {
				1082	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1083	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1084	details);
				1085	return -1;
				1086	}
				1087	else if (strcmp(errors,"ignore") == 0) {
				1088	return 0;
				1089	}
				1090	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1091	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1092	return 0;
				1093	}
				1094	else {
				1095	PyErr_Format(PyExc_ValueError,
				1096	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1097	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1098	errors);
				1099	return -1;
				1100	}
				1101	}
				1102
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1103	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1104
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1105	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1106	int size,
				1107	const char *errors)
				1108	{
				1109	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1110	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1111	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1112	char* message;
				1113	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1114
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1115	/* Escaped strings will always be longer than the resulting
				1116	Unicode string, so we start with size here and then reduce the
				1117	length after conversion to the true value. */
				1118	v = _PyUnicode_New(size);
				1119	if (v == NULL)
				1120	goto onError;
				1121	if (size == 0)
				1122	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1123
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1124	p = buf = PyUnicode_AS_UNICODE(v);
				1125	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1126
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1127	while (s < end) {
				1128	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1129	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1130	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1131
				1132	/* Non-escape characters are interpreted as Unicode ordinals */
				1133	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1134	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1135	continue;
				1136	}
				1137
				1138	/* \ - Escapes */
				1139	s++;
				1140	switch (*s++) {
				1141
				1142	/* \x escapes */
				1143	case '\n': break;
				1144	case '\\': *p++ = '\\'; break;
				1145	case '\'': *p++ = '\''; break;
				1146	case '\"': *p++ = '\"'; break;
				1147	case 'b': *p++ = '\b'; break;
				1148	case 'f': p++ = '\014'; break; / FF */
				1149	case 't': *p++ = '\t'; break;
				1150	case 'n': *p++ = '\n'; break;
				1151	case 'r': *p++ = '\r'; break;
				1152	case 'v': p++ = '\013'; break; / VT */
				1153	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1154
				1155	/* \OOO (octal) escapes */
				1156	case '0': case '1': case '2': case '3':
				1157	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1158	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1160	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1162	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1163	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1164	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1165	break;
				1166
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1167	/* hex escapes */
				1168	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1169	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1170	digits = 2;
				1171	message = "truncated \\xXX escape";
				1172	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1173
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1174	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1175	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1176	digits = 4;
				1177	message = "truncated \\uXXXX escape";
				1178	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1179
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1180	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1181	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1182	digits = 8;
				1183	message = "truncated \\UXXXXXXXX escape";
				1184	hexescape:
				1185	chr = 0;
				1186	for (i = 0; i < digits; i++) {
				1187	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1188	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1189	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1190	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1191	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1192	i++;
				1193	break;
				1194	}
				1195	chr = (chr<<4) & ~0xF;
				1196	if (c >= '0' && c <= '9')
				1197	chr += c - '0';
				1198	else if (c >= 'a' && c <= 'f')
				1199	chr += 10 + c - 'a';
				1200	else
				1201	chr += 10 + c - 'A';
				1202	}
				1203	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1204	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1205	/* when we get here, chr is a 32-bit unicode character */
				1206	if (chr <= 0xffff)
				1207	/* UCS-2 character */
				1208	*p++ = (Py_UNICODE) chr;
				1209	else if (chr <= 0x10ffff) {
				1210	/* UCS-4 character. store as two surrogate characters */
				1211	chr -= 0x10000L;
				1212	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1213	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1214	} else {
				1215	if (unicodeescape_decoding_error(
				1216	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1217	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1218	)
				1219	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1220	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1221	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1222	break;
				1223
				1224	/* \N{name} */
				1225	case 'N':
				1226	message = "malformed \\N character escape";
				1227	if (ucnhash_CAPI == NULL) {
				1228	/* load the unicode data module */
				1229	PyObject m, v;
				1230	m = PyImport_ImportModule("unicodedata");
				1231	if (m == NULL)
				1232	goto ucnhashError;
				1233	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1234	Py_DECREF(m);
				1235	if (v == NULL)
				1236	goto ucnhashError;
				1237	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1238	Py_DECREF(v);
				1239	if (ucnhash_CAPI == NULL)
				1240	goto ucnhashError;
				1241	}
				1242	if (*s == '{') {
				1243	const char *start = s+1;
				1244	/* look for the closing brace */
				1245	while (*s != '}' && s < end)
				1246	s++;
				1247	if (s > start && s < end && *s == '}') {
				1248	/* found a name. look it up in the unicode database */
				1249	message = "unknown Unicode character name";
				1250	s++;
				1251	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1252	goto store;
				1253	}
				1254	}
				1255	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1256	goto onError;
				1257	*p++ = x;
				1258	break;
				1259
				1260	default:
				1261	*p++ = '\\';
				1262	*p++ = (unsigned char)s[-1];
				1263	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1264	}
				1265	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1266	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1267	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1268	return (PyObject *)v;
				1269
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1270	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1271	PyErr_SetString(
				1272	PyExc_UnicodeError,
				1273	"\\N escapes not supported (can't load unicodedata module)"
				1274	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1275	return NULL;
				1276
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1277	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1278	Py_XDECREF(v);
				1279	return NULL;
				1280	}
				1281
				1282	/* Return a Unicode-Escape string version of the Unicode object.
				1283
				1284	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1285	appropriate.
				1286
				1287	*/
				1288
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1289	static const Py_UNICODE findchar(const Py_UNICODE s,
				1290	int size,
				1291	Py_UNICODE ch);
				1292
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1293	static
				1294	PyObject unicodeescape_string(const Py_UNICODE s,
				1295	int size,
				1296	int quotes)
				1297	{
				1298	PyObject *repr;
				1299	char *p;
				1300	char *q;
				1301
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1302	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1303
				1304	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1305	if (repr == NULL)
				1306	return NULL;
				1307
				1308	p = q = PyString_AS_STRING(repr);
				1309
				1310	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1311	*p++ = 'u';
				1312	*p++ = (findchar(s, size, '\'') &&
				1313	!findchar(s, size, '"')) ? '"' : '\'';
				1314	}
				1315	while (size-- > 0) {
				1316	Py_UNICODE ch = *s++;
				1317	/* Escape quotes */
				1318	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1319	*p++ = '\\';
				1320	*p++ = (char) ch;
				1321	}
				1322	/* Map 16-bit characters to '\uxxxx' */
				1323	else if (ch >= 256) {
				1324	*p++ = '\\';
				1325	*p++ = 'u';
				1326	*p++ = hexdigit[(ch >> 12) & 0xf];
				1327	*p++ = hexdigit[(ch >> 8) & 0xf];
				1328	*p++ = hexdigit[(ch >> 4) & 0xf];
				1329	*p++ = hexdigit[ch & 15];
				1330	}
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1331	/* Map special whitespace to '\t', \n', '\r' */
				1332	else if (ch == '\t') {
				1333	*p++ = '\\';
				1334	*p++ = 't';
				1335	}
				1336	else if (ch == '\n') {
				1337	*p++ = '\\';
				1338	*p++ = 'n';
				1339	}
				1340	else if (ch == '\r') {
				1341	*p++ = '\\';
				1342	*p++ = 'r';
				1343	}
				1344	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1345	else if (ch < ' ' \|\| ch >= 128) {
				1346	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1347	*p++ = 'x';
				1348	*p++ = hexdigit[(ch >> 4) & 0xf];
				1349	*p++ = hexdigit[ch & 15];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1350	}
				1351	/* Copy everything else as-is */
				1352	else
				1353	*p++ = (char) ch;
				1354	}
				1355	if (quotes)
				1356	*p++ = q[1];
				1357
				1358	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1359	if (_PyString_Resize(&repr, p - q))
				1360	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1361
				1362	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1363
				1364	onError:
				1365	Py_DECREF(repr);
				1366	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1367	}
				1368
				1369	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1370	int size)
				1371	{
				1372	return unicodeescape_string(s, size, 0);
				1373	}
				1374
				1375	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1376	{
				1377	if (!PyUnicode_Check(unicode)) {
				1378	PyErr_BadArgument();
				1379	return NULL;
				1380	}
				1381	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1382	PyUnicode_GET_SIZE(unicode));
				1383	}
				1384
				1385	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1386
				1387	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1388	int size,
				1389	const char *errors)
				1390	{
				1391	PyUnicodeObject *v;
				1392	Py_UNICODE p, buf;
				1393	const char *end;
				1394	const char *bs;
				1395
				1396	/* Escaped strings will always be longer than the resulting
				1397	Unicode string, so we start with size here and then reduce the
				1398	length after conversion to the true value. */
				1399	v = _PyUnicode_New(size);
				1400	if (v == NULL)
				1401	goto onError;
				1402	if (size == 0)
				1403	return (PyObject *)v;
				1404	p = buf = PyUnicode_AS_UNICODE(v);
				1405	end = s + size;
				1406	while (s < end) {
				1407	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1408	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1409	int i;
				1410
				1411	/* Non-escape characters are interpreted as Unicode ordinals */
				1412	if (*s != '\\') {
				1413	p++ = (unsigned char)s++;
				1414	continue;
				1415	}
				1416
				1417	/* \u-escapes are only interpreted iff the number of leading
				1418	backslashes if odd */
				1419	bs = s;
				1420	for (;s < end;) {
				1421	if (*s != '\\')
				1422	break;
				1423	p++ = (unsigned char)s++;
				1424	}
				1425	if (((s - bs) & 1) == 0 \|\|
				1426	s >= end \|\|
				1427	*s != 'u') {
				1428	continue;
				1429	}
				1430	p--;
				1431	s++;
				1432
				1433	/* \uXXXX with 4 hex digits */
				1434	for (x = 0, i = 0; i < 4; i++) {
				1435	c = (unsigned char)s[i];
				1436	if (!isxdigit(c)) {
				1437	if (unicodeescape_decoding_error(&s, &x, errors,
				1438	"truncated \\uXXXX"))
				1439	goto onError;
				1440	i++;
				1441	break;
				1442	}
				1443	x = (x<<4) & ~0xF;
				1444	if (c >= '0' && c <= '9')
				1445	x += c - '0';
				1446	else if (c >= 'a' && c <= 'f')
				1447	x += 10 + c - 'a';
				1448	else
				1449	x += 10 + c - 'A';
				1450	}
				1451	s += i;
				1452	*p++ = x;
				1453	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1454	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1455	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1456	return (PyObject *)v;
				1457
				1458	onError:
				1459	Py_XDECREF(v);
				1460	return NULL;
				1461	}
				1462
				1463	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1464	int size)
				1465	{
				1466	PyObject *repr;
				1467	char *p;
				1468	char *q;
				1469
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1470	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1471
				1472	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1473	if (repr == NULL)
				1474	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1475	if (size == 0)
				1476	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1477
				1478	p = q = PyString_AS_STRING(repr);
				1479	while (size-- > 0) {
				1480	Py_UNICODE ch = *s++;
				1481	/* Map 16-bit characters to '\uxxxx' */
				1482	if (ch >= 256) {
				1483	*p++ = '\\';
				1484	*p++ = 'u';
				1485	*p++ = hexdigit[(ch >> 12) & 0xf];
				1486	*p++ = hexdigit[(ch >> 8) & 0xf];
				1487	*p++ = hexdigit[(ch >> 4) & 0xf];
				1488	*p++ = hexdigit[ch & 15];
				1489	}
				1490	/* Copy everything else as-is */
				1491	else
				1492	*p++ = (char) ch;
				1493	}
				1494	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1495	if (_PyString_Resize(&repr, p - q))
				1496	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1497
				1498	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1499
				1500	onError:
				1501	Py_DECREF(repr);
				1502	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1503	}
				1504
				1505	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1506	{
				1507	if (!PyUnicode_Check(unicode)) {
				1508	PyErr_BadArgument();
				1509	return NULL;
				1510	}
				1511	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1512	PyUnicode_GET_SIZE(unicode));
				1513	}
				1514
				1515	/* --- Latin-1 Codec ------------------------------------------------------ */
				1516
				1517	PyObject PyUnicode_DecodeLatin1(const char s,
				1518	int size,
				1519	const char *errors)
				1520	{
				1521	PyUnicodeObject *v;
				1522	Py_UNICODE *p;
				1523
				1524	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1525	v = _PyUnicode_New(size);
				1526	if (v == NULL)
				1527	goto onError;
				1528	if (size == 0)
				1529	return (PyObject *)v;
				1530	p = PyUnicode_AS_UNICODE(v);
				1531	while (size-- > 0)
				1532	p++ = (unsigned char)s++;
				1533	return (PyObject *)v;
				1534
				1535	onError:
				1536	Py_XDECREF(v);
				1537	return NULL;
				1538	}
				1539
				1540	static
				1541	int latin1_encoding_error(const Py_UNICODE **source,
				1542	char **dest,
				1543	const char *errors,
				1544	const char *details)
				1545	{
				1546	if ((errors == NULL) \|\|
				1547	(strcmp(errors,"strict") == 0)) {
				1548	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1549	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1550	details);
				1551	return -1;
				1552	}
				1553	else if (strcmp(errors,"ignore") == 0) {
				1554	return 0;
				1555	}
				1556	else if (strcmp(errors,"replace") == 0) {
				1557	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1558	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1559	return 0;
				1560	}
				1561	else {
				1562	PyErr_Format(PyExc_ValueError,
				1563	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1564	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1565	errors);
				1566	return -1;
				1567	}
				1568	}
				1569
				1570	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1571	int size,
				1572	const char *errors)
				1573	{
				1574	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1575	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1576
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1577	repr = PyString_FromStringAndSize(NULL, size);
				1578	if (repr == NULL)
				1579	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1580	if (size == 0)
				1581	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1582
				1583	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1584	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1585	while (size-- > 0) {
				1586	Py_UNICODE ch = *p++;
				1587	if (ch >= 256) {
				1588	if (latin1_encoding_error(&p, &s, errors,
				1589	"ordinal not in range(256)"))
				1590	goto onError;
				1591	}
				1592	else
				1593	*s++ = (char)ch;
				1594	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1595	/* Resize if error handling skipped some characters */
				1596	if (s - start < PyString_GET_SIZE(repr))
				1597	if (_PyString_Resize(&repr, s - start))
				1598	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1599	return repr;
				1600
				1601	onError:
				1602	Py_DECREF(repr);
				1603	return NULL;
				1604	}
				1605
				1606	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1607	{
				1608	if (!PyUnicode_Check(unicode)) {
				1609	PyErr_BadArgument();
				1610	return NULL;
				1611	}
				1612	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1613	PyUnicode_GET_SIZE(unicode),
				1614	NULL);
				1615	}
				1616
				1617	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1618
				1619	static
				1620	int ascii_decoding_error(const char **source,
				1621	Py_UNICODE **dest,
				1622	const char *errors,
				1623	const char *details)
				1624	{
				1625	if ((errors == NULL) \|\|
				1626	(strcmp(errors,"strict") == 0)) {
				1627	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1628	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1629	details);
				1630	return -1;
				1631	}
				1632	else if (strcmp(errors,"ignore") == 0) {
				1633	return 0;
				1634	}
				1635	else if (strcmp(errors,"replace") == 0) {
				1636	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1637	(*dest)++;
				1638	return 0;
				1639	}
				1640	else {
				1641	PyErr_Format(PyExc_ValueError,
				1642	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1643	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1644	errors);
				1645	return -1;
				1646	}
				1647	}
				1648
				1649	PyObject PyUnicode_DecodeASCII(const char s,
				1650	int size,
				1651	const char *errors)
				1652	{
				1653	PyUnicodeObject *v;
				1654	Py_UNICODE *p;
				1655
				1656	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1657	v = _PyUnicode_New(size);
				1658	if (v == NULL)
				1659	goto onError;
				1660	if (size == 0)
				1661	return (PyObject *)v;
				1662	p = PyUnicode_AS_UNICODE(v);
				1663	while (size-- > 0) {
				1664	register unsigned char c;
				1665
				1666	c = (unsigned char)*s++;
				1667	if (c < 128)
				1668	*p++ = c;
				1669	else if (ascii_decoding_error(&s, &p, errors,
				1670	"ordinal not in range(128)"))
				1671	goto onError;
				1672	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1673	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1674	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1675	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1676	return (PyObject *)v;
				1677
				1678	onError:
				1679	Py_XDECREF(v);
				1680	return NULL;
				1681	}
				1682
				1683	static
				1684	int ascii_encoding_error(const Py_UNICODE **source,
				1685	char **dest,
				1686	const char *errors,
				1687	const char *details)
				1688	{
				1689	if ((errors == NULL) \|\|
				1690	(strcmp(errors,"strict") == 0)) {
				1691	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1692	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1693	details);
				1694	return -1;
				1695	}
				1696	else if (strcmp(errors,"ignore") == 0) {
				1697	return 0;
				1698	}
				1699	else if (strcmp(errors,"replace") == 0) {
				1700	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1701	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1702	return 0;
				1703	}
				1704	else {
				1705	PyErr_Format(PyExc_ValueError,
				1706	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1707	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1708	errors);
				1709	return -1;
				1710	}
				1711	}
				1712
				1713	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1714	int size,
				1715	const char *errors)
				1716	{
				1717	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1718	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1719
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1720	repr = PyString_FromStringAndSize(NULL, size);
				1721	if (repr == NULL)
				1722	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1723	if (size == 0)
				1724	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1725
				1726	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1727	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1728	while (size-- > 0) {
				1729	Py_UNICODE ch = *p++;
				1730	if (ch >= 128) {
				1731	if (ascii_encoding_error(&p, &s, errors,
				1732	"ordinal not in range(128)"))
				1733	goto onError;
				1734	}
				1735	else
				1736	*s++ = (char)ch;
				1737	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1738	/* Resize if error handling skipped some characters */
				1739	if (s - start < PyString_GET_SIZE(repr))
				1740	if (_PyString_Resize(&repr, s - start))
				1741	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1742	return repr;
				1743
				1744	onError:
				1745	Py_DECREF(repr);
				1746	return NULL;
				1747	}
				1748
				1749	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1750	{
				1751	if (!PyUnicode_Check(unicode)) {
				1752	PyErr_BadArgument();
				1753	return NULL;
				1754	}
				1755	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1756	PyUnicode_GET_SIZE(unicode),
				1757	NULL);
				1758	}
				1759
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1760	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1761
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1762	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1763
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1764	PyObject PyUnicode_DecodeMBCS(const char s,
				1765	int size,
				1766	const char *errors)
				1767	{
				1768	PyUnicodeObject *v;
				1769	Py_UNICODE *p;
				1770
				1771	/* First get the size of the result */
				1772	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1773	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1774	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1775
				1776	v = _PyUnicode_New(usize);
				1777	if (v == NULL)
				1778	return NULL;
				1779	if (usize == 0)
				1780	return (PyObject *)v;
				1781	p = PyUnicode_AS_UNICODE(v);
				1782	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1783	Py_DECREF(v);
				1784	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1785	}
				1786
				1787	return (PyObject *)v;
				1788	}
				1789
				1790	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1791	int size,
				1792	const char *errors)
				1793	{
				1794	PyObject *repr;
				1795	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1796	DWORD mbcssize;
				1797
				1798	/* If there are no characters, bail now! */
				1799	if (size==0)
				1800	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1801
				1802	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1803	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1804	if (mbcssize==0)
				1805	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1806
				1807	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1808	if (repr == NULL)
				1809	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1810	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1811	return repr;
				1812
				1813	/* Do the conversion */
				1814	s = PyString_AS_STRING(repr);
				1815	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1816	Py_DECREF(repr);
				1817	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1818	}
				1819	return repr;
				1820	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1821
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1822	#endif /* MS_WIN32 */
				1823
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1824	/* --- Character Mapping Codec -------------------------------------------- */
				1825
				1826	static
				1827	int charmap_decoding_error(const char **source,
				1828	Py_UNICODE **dest,
				1829	const char *errors,
				1830	const char *details)
				1831	{
				1832	if ((errors == NULL) \|\|
				1833	(strcmp(errors,"strict") == 0)) {
				1834	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1835	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1836	details);
				1837	return -1;
				1838	}
				1839	else if (strcmp(errors,"ignore") == 0) {
				1840	return 0;
				1841	}
				1842	else if (strcmp(errors,"replace") == 0) {
				1843	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1844	(*dest)++;
				1845	return 0;
				1846	}
				1847	else {
				1848	PyErr_Format(PyExc_ValueError,
				1849	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1850	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1851	errors);
				1852	return -1;
				1853	}
				1854	}
				1855
				1856	PyObject PyUnicode_DecodeCharmap(const char s,
				1857	int size,
				1858	PyObject *mapping,
				1859	const char *errors)
				1860	{
				1861	PyUnicodeObject *v;
				1862	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1863	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1864
				1865	/* Default to Latin-1 */
				1866	if (mapping == NULL)
				1867	return PyUnicode_DecodeLatin1(s, size, errors);
				1868
				1869	v = _PyUnicode_New(size);
				1870	if (v == NULL)
				1871	goto onError;
				1872	if (size == 0)
				1873	return (PyObject *)v;
				1874	p = PyUnicode_AS_UNICODE(v);
				1875	while (size-- > 0) {
				1876	unsigned char ch = *s++;
				1877	PyObject w, x;
				1878
				1879	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1880	w = PyInt_FromLong((long)ch);
				1881	if (w == NULL)
				1882	goto onError;
				1883	x = PyObject_GetItem(mapping, w);
				1884	Py_DECREF(w);
				1885	if (x == NULL) {
				1886	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1887	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1888	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1889	x = Py_None;
				1890	Py_INCREF(x);
				1891	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1892	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1893	}
				1894
				1895	/* Apply mapping */
				1896	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1897	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1898	if (value < 0 \|\| value > 65535) {
				1899	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1900	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1901	Py_DECREF(x);
				1902	goto onError;
				1903	}
				1904	*p++ = (Py_UNICODE)value;
				1905	}
				1906	else if (x == Py_None) {
				1907	/* undefined mapping */
				1908	if (charmap_decoding_error(&s, &p, errors,
				1909	"character maps to <undefined>")) {
				1910	Py_DECREF(x);
				1911	goto onError;
				1912	}
				1913	}
				1914	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1915	int targetsize = PyUnicode_GET_SIZE(x);
				1916
				1917	if (targetsize == 1)
				1918	/* 1-1 mapping */
				1919	p++ = PyUnicode_AS_UNICODE(x);
				1920
				1921	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1922	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1923	if (targetsize > extrachars) {
				1924	/* resize first */
				1925	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				1926	int needed = (targetsize - extrachars) + \
				1927	(targetsize << 2);
				1928	extrachars += needed;
				1929	if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1930	Py_DECREF(x);
				1931	goto onError;
				1932	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1933	p = PyUnicode_AS_UNICODE(v) + oldpos;
				1934	}
				1935	Py_UNICODE_COPY(p,
				1936	PyUnicode_AS_UNICODE(x),
				1937	targetsize);
				1938	p += targetsize;
				1939	extrachars -= targetsize;
				1940	}
				1941	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1942	}
				1943	else {
				1944	/* wrong return value */
				1945	PyErr_SetString(PyExc_TypeError,
				1946	"character mapping must return integer, None or unicode");
				1947	Py_DECREF(x);
				1948	goto onError;
				1949	}
				1950	Py_DECREF(x);
				1951	}
				1952	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1953	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1954	goto onError;
				1955	return (PyObject *)v;
				1956
				1957	onError:
				1958	Py_XDECREF(v);
				1959	return NULL;
				1960	}
				1961
				1962	static
				1963	int charmap_encoding_error(const Py_UNICODE **source,
				1964	char **dest,
				1965	const char *errors,
				1966	const char *details)
				1967	{
				1968	if ((errors == NULL) \|\|
				1969	(strcmp(errors,"strict") == 0)) {
				1970	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1971	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1972	details);
				1973	return -1;
				1974	}
				1975	else if (strcmp(errors,"ignore") == 0) {
				1976	return 0;
				1977	}
				1978	else if (strcmp(errors,"replace") == 0) {
				1979	**dest = '?';
				1980	(*dest)++;
				1981	return 0;
				1982	}
				1983	else {
				1984	PyErr_Format(PyExc_ValueError,
				1985	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1986	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1987	errors);
				1988	return -1;
				1989	}
				1990	}
				1991
				1992	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1993	int size,
				1994	PyObject *mapping,
				1995	const char *errors)
				1996	{
				1997	PyObject *v;
				1998	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1999	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2000
				2001	/* Default to Latin-1 */
				2002	if (mapping == NULL)
				2003	return PyUnicode_EncodeLatin1(p, size, errors);
				2004
				2005	v = PyString_FromStringAndSize(NULL, size);
				2006	if (v == NULL)
				2007	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2008	if (size == 0)
				2009	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2010	s = PyString_AS_STRING(v);
				2011	while (size-- > 0) {
				2012	Py_UNICODE ch = *p++;
				2013	PyObject w, x;
				2014
				2015	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2016	w = PyInt_FromLong((long)ch);
				2017	if (w == NULL)
				2018	goto onError;
				2019	x = PyObject_GetItem(mapping, w);
				2020	Py_DECREF(w);
				2021	if (x == NULL) {
				2022	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2023	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2024	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2025	x = Py_None;
				2026	Py_INCREF(x);
				2027	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2028	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2029	}
				2030
				2031	/* Apply mapping */
				2032	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2033	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2034	if (value < 0 \|\| value > 255) {
				2035	PyErr_SetString(PyExc_TypeError,
				2036	"character mapping must be in range(256)");
				2037	Py_DECREF(x);
				2038	goto onError;
				2039	}
				2040	*s++ = (char)value;
				2041	}
				2042	else if (x == Py_None) {
				2043	/* undefined mapping */
				2044	if (charmap_encoding_error(&p, &s, errors,
				2045	"character maps to <undefined>")) {
				2046	Py_DECREF(x);
				2047	goto onError;
				2048	}
				2049	}
				2050	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2051	int targetsize = PyString_GET_SIZE(x);
				2052
				2053	if (targetsize == 1)
				2054	/* 1-1 mapping */
				2055	s++ = PyString_AS_STRING(x);
				2056
				2057	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2058	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2059	if (targetsize > extrachars) {
				2060	/* resize first */
				2061	int oldpos = (int)(s - PyString_AS_STRING(v));
				2062	int needed = (targetsize - extrachars) + \
				2063	(targetsize << 2);
				2064	extrachars += needed;
				2065	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2066	Py_DECREF(x);
				2067	goto onError;
				2068	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2069	s = PyString_AS_STRING(v) + oldpos;
				2070	}
				2071	memcpy(s,
				2072	PyString_AS_STRING(x),
				2073	targetsize);
				2074	s += targetsize;
				2075	extrachars -= targetsize;
				2076	}
				2077	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2078	}
				2079	else {
				2080	/* wrong return value */
				2081	PyErr_SetString(PyExc_TypeError,
				2082	"character mapping must return integer, None or unicode");
				2083	Py_DECREF(x);
				2084	goto onError;
				2085	}
				2086	Py_DECREF(x);
				2087	}
				2088	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2089	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2090	goto onError;
				2091	return v;
				2092
				2093	onError:
				2094	Py_DECREF(v);
				2095	return NULL;
				2096	}
				2097
				2098	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2099	PyObject *mapping)
				2100	{
				2101	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2102	PyErr_BadArgument();
				2103	return NULL;
				2104	}
				2105	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2106	PyUnicode_GET_SIZE(unicode),
				2107	mapping,
				2108	NULL);
				2109	}
				2110
				2111	static
				2112	int translate_error(const Py_UNICODE **source,
				2113	Py_UNICODE **dest,
				2114	const char *errors,
				2115	const char *details)
				2116	{
				2117	if ((errors == NULL) \|\|
				2118	(strcmp(errors,"strict") == 0)) {
				2119	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2120	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2121	details);
				2122	return -1;
				2123	}
				2124	else if (strcmp(errors,"ignore") == 0) {
				2125	return 0;
				2126	}
				2127	else if (strcmp(errors,"replace") == 0) {
				2128	**dest = '?';
				2129	(*dest)++;
				2130	return 0;
				2131	}
				2132	else {
				2133	PyErr_Format(PyExc_ValueError,
				2134	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2135	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2136	errors);
				2137	return -1;
				2138	}
				2139	}
				2140
				2141	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2142	int size,
				2143	PyObject *mapping,
				2144	const char *errors)
				2145	{
				2146	PyUnicodeObject *v;
				2147	Py_UNICODE *p;
				2148
				2149	if (mapping == NULL) {
				2150	PyErr_BadArgument();
				2151	return NULL;
				2152	}
				2153
				2154	/* Output will never be longer than input */
				2155	v = _PyUnicode_New(size);
				2156	if (v == NULL)
				2157	goto onError;
				2158	if (size == 0)
				2159	goto done;
				2160	p = PyUnicode_AS_UNICODE(v);
				2161	while (size-- > 0) {
				2162	Py_UNICODE ch = *s++;
				2163	PyObject w, x;
				2164
				2165	/* Get mapping */
				2166	w = PyInt_FromLong(ch);
				2167	if (w == NULL)
				2168	goto onError;
				2169	x = PyObject_GetItem(mapping, w);
				2170	Py_DECREF(w);
				2171	if (x == NULL) {
				2172	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2173	/* No mapping found: default to 1-1 mapping */
				2174	PyErr_Clear();
				2175	*p++ = ch;
				2176	continue;
				2177	}
				2178	goto onError;
				2179	}
				2180
				2181	/* Apply mapping */
				2182	if (PyInt_Check(x))
				2183	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2184	else if (x == Py_None) {
				2185	/* undefined mapping */
				2186	if (translate_error(&s, &p, errors,
				2187	"character maps to <undefined>")) {
				2188	Py_DECREF(x);
				2189	goto onError;
				2190	}
				2191	}
				2192	else if (PyUnicode_Check(x)) {
				2193	if (PyUnicode_GET_SIZE(x) != 1) {
				2194	/* 1-n mapping */
				2195	PyErr_SetString(PyExc_NotImplementedError,
				2196	"1-n mappings are currently not implemented");
				2197	Py_DECREF(x);
				2198	goto onError;
				2199	}
				2200	p++ = PyUnicode_AS_UNICODE(x);
				2201	}
				2202	else {
				2203	/* wrong return value */
				2204	PyErr_SetString(PyExc_TypeError,
				2205	"translate mapping must return integer, None or unicode");
				2206	Py_DECREF(x);
				2207	goto onError;
				2208	}
				2209	Py_DECREF(x);
				2210	}
				2211	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2212	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2213	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2214
				2215	done:
				2216	return (PyObject *)v;
				2217
				2218	onError:
				2219	Py_XDECREF(v);
				2220	return NULL;
				2221	}
				2222
				2223	PyObject PyUnicode_Translate(PyObject str,
				2224	PyObject *mapping,
				2225	const char *errors)
				2226	{
				2227	PyObject *result;
				2228
				2229	str = PyUnicode_FromObject(str);
				2230	if (str == NULL)
				2231	goto onError;
				2232	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2233	PyUnicode_GET_SIZE(str),
				2234	mapping,
				2235	errors);
				2236	Py_DECREF(str);
				2237	return result;
				2238
				2239	onError:
				2240	Py_XDECREF(str);
				2241	return NULL;
				2242	}
				2243
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2244	/* --- Decimal Encoder ---------------------------------------------------- */
				2245
				2246	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2247	int length,
				2248	char *output,
				2249	const char *errors)
				2250	{
				2251	Py_UNICODE p, end;
				2252
				2253	if (output == NULL) {
				2254	PyErr_BadArgument();
				2255	return -1;
				2256	}
				2257
				2258	p = s;
				2259	end = s + length;
				2260	while (p < end) {
				2261	register Py_UNICODE ch = *p++;
				2262	int decimal;
				2263
				2264	if (Py_UNICODE_ISSPACE(ch)) {
				2265	*output++ = ' ';
				2266	continue;
				2267	}
				2268	decimal = Py_UNICODE_TODECIMAL(ch);
				2269	if (decimal >= 0) {
				2270	*output++ = '0' + decimal;
				2271	continue;
				2272	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2273	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2274	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2275	continue;
				2276	}
				2277	/* All other characters are considered invalid */
				2278	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2279	PyErr_SetString(PyExc_ValueError,
				2280	"invalid decimal Unicode string");
				2281	goto onError;
				2282	}
				2283	else if (strcmp(errors, "ignore") == 0)
				2284	continue;
				2285	else if (strcmp(errors, "replace") == 0) {
				2286	*output++ = '?';
				2287	continue;
				2288	}
				2289	}
				2290	/* 0-terminate the output string */
				2291	*output++ = '\0';
				2292	return 0;
				2293
				2294	onError:
				2295	return -1;
				2296	}
				2297
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2298	/* --- Helpers ------------------------------------------------------------ */
				2299
				2300	static
				2301	int count(PyUnicodeObject *self,
				2302	int start,
				2303	int end,
				2304	PyUnicodeObject *substring)
				2305	{
				2306	int count = 0;
				2307
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2308	if (start < 0)
				2309	start += self->length;
				2310	if (start < 0)
				2311	start = 0;
				2312	if (end > self->length)
				2313	end = self->length;
				2314	if (end < 0)
				2315	end += self->length;
				2316	if (end < 0)
				2317	end = 0;
				2318
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2319	if (substring->length == 0)
				2320	return (end - start + 1);
				2321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2322	end -= substring->length;
				2323
				2324	while (start <= end)
				2325	if (Py_UNICODE_MATCH(self, start, substring)) {
				2326	count++;
				2327	start += substring->length;
				2328	} else
				2329	start++;
				2330
				2331	return count;
				2332	}
				2333
				2334	int PyUnicode_Count(PyObject *str,
				2335	PyObject *substr,
				2336	int start,
				2337	int end)
				2338	{
				2339	int result;
				2340
				2341	str = PyUnicode_FromObject(str);
				2342	if (str == NULL)
				2343	return -1;
				2344	substr = PyUnicode_FromObject(substr);
				2345	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2346	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2347	return -1;
				2348	}
				2349
				2350	result = count((PyUnicodeObject *)str,
				2351	start, end,
				2352	(PyUnicodeObject *)substr);
				2353
				2354	Py_DECREF(str);
				2355	Py_DECREF(substr);
				2356	return result;
				2357	}
				2358
				2359	static
				2360	int findstring(PyUnicodeObject *self,
				2361	PyUnicodeObject *substring,
				2362	int start,
				2363	int end,
				2364	int direction)
				2365	{
				2366	if (start < 0)
				2367	start += self->length;
				2368	if (start < 0)
				2369	start = 0;
				2370
				2371	if (substring->length == 0)
				2372	return start;
				2373
				2374	if (end > self->length)
				2375	end = self->length;
				2376	if (end < 0)
				2377	end += self->length;
				2378	if (end < 0)
				2379	end = 0;
				2380
				2381	end -= substring->length;
				2382
				2383	if (direction < 0) {
				2384	for (; end >= start; end--)
				2385	if (Py_UNICODE_MATCH(self, end, substring))
				2386	return end;
				2387	} else {
				2388	for (; start <= end; start++)
				2389	if (Py_UNICODE_MATCH(self, start, substring))
				2390	return start;
				2391	}
				2392
				2393	return -1;
				2394	}
				2395
				2396	int PyUnicode_Find(PyObject *str,
				2397	PyObject *substr,
				2398	int start,
				2399	int end,
				2400	int direction)
				2401	{
				2402	int result;
				2403
				2404	str = PyUnicode_FromObject(str);
				2405	if (str == NULL)
				2406	return -1;
				2407	substr = PyUnicode_FromObject(substr);
				2408	if (substr == NULL) {
				2409	Py_DECREF(substr);
				2410	return -1;
				2411	}
				2412
				2413	result = findstring((PyUnicodeObject *)str,
				2414	(PyUnicodeObject *)substr,
				2415	start, end, direction);
				2416	Py_DECREF(str);
				2417	Py_DECREF(substr);
				2418	return result;
				2419	}
				2420
				2421	static
				2422	int tailmatch(PyUnicodeObject *self,
				2423	PyUnicodeObject *substring,
				2424	int start,
				2425	int end,
				2426	int direction)
				2427	{
				2428	if (start < 0)
				2429	start += self->length;
				2430	if (start < 0)
				2431	start = 0;
				2432
				2433	if (substring->length == 0)
				2434	return 1;
				2435
				2436	if (end > self->length)
				2437	end = self->length;
				2438	if (end < 0)
				2439	end += self->length;
				2440	if (end < 0)
				2441	end = 0;
				2442
				2443	end -= substring->length;
				2444	if (end < start)
				2445	return 0;
				2446
				2447	if (direction > 0) {
				2448	if (Py_UNICODE_MATCH(self, end, substring))
				2449	return 1;
				2450	} else {
				2451	if (Py_UNICODE_MATCH(self, start, substring))
				2452	return 1;
				2453	}
				2454
				2455	return 0;
				2456	}
				2457
				2458	int PyUnicode_Tailmatch(PyObject *str,
				2459	PyObject *substr,
				2460	int start,
				2461	int end,
				2462	int direction)
				2463	{
				2464	int result;
				2465
				2466	str = PyUnicode_FromObject(str);
				2467	if (str == NULL)
				2468	return -1;
				2469	substr = PyUnicode_FromObject(substr);
				2470	if (substr == NULL) {
				2471	Py_DECREF(substr);
				2472	return -1;
				2473	}
				2474
				2475	result = tailmatch((PyUnicodeObject *)str,
				2476	(PyUnicodeObject *)substr,
				2477	start, end, direction);
				2478	Py_DECREF(str);
				2479	Py_DECREF(substr);
				2480	return result;
				2481	}
				2482
				2483	static
				2484	const Py_UNICODE findchar(const Py_UNICODE s,
				2485	int size,
				2486	Py_UNICODE ch)
				2487	{
				2488	/* like wcschr, but doesn't stop at NULL characters */
				2489
				2490	while (size-- > 0) {
				2491	if (*s == ch)
				2492	return s;
				2493	s++;
				2494	}
				2495
				2496	return NULL;
				2497	}
				2498
				2499	/* Apply fixfct filter to the Unicode object self and return a
				2500	reference to the modified object */
				2501
				2502	static
				2503	PyObject fixup(PyUnicodeObject self,
				2504	int (fixfct)(PyUnicodeObject s))
				2505	{
				2506
				2507	PyUnicodeObject *u;
				2508
				2509	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2510	self->length);
				2511	if (u == NULL)
				2512	return NULL;
				2513	if (!fixfct(u)) {
				2514	/* fixfct should return TRUE if it modified the buffer. If
				2515	FALSE, return a reference to the original buffer instead
				2516	(to save space, not time) */
				2517	Py_INCREF(self);
				2518	Py_DECREF(u);
				2519	return (PyObject*) self;
				2520	}
				2521	return (PyObject*) u;
				2522	}
				2523
				2524	static
				2525	int fixupper(PyUnicodeObject *self)
				2526	{
				2527	int len = self->length;
				2528	Py_UNICODE *s = self->str;
				2529	int status = 0;
				2530
				2531	while (len-- > 0) {
				2532	register Py_UNICODE ch;
				2533
				2534	ch = Py_UNICODE_TOUPPER(*s);
				2535	if (ch != *s) {
				2536	status = 1;
				2537	*s = ch;
				2538	}
				2539	s++;
				2540	}
				2541
				2542	return status;
				2543	}
				2544
				2545	static
				2546	int fixlower(PyUnicodeObject *self)
				2547	{
				2548	int len = self->length;
				2549	Py_UNICODE *s = self->str;
				2550	int status = 0;
				2551
				2552	while (len-- > 0) {
				2553	register Py_UNICODE ch;
				2554
				2555	ch = Py_UNICODE_TOLOWER(*s);
				2556	if (ch != *s) {
				2557	status = 1;
				2558	*s = ch;
				2559	}
				2560	s++;
				2561	}
				2562
				2563	return status;
				2564	}
				2565
				2566	static
				2567	int fixswapcase(PyUnicodeObject *self)
				2568	{
				2569	int len = self->length;
				2570	Py_UNICODE *s = self->str;
				2571	int status = 0;
				2572
				2573	while (len-- > 0) {
				2574	if (Py_UNICODE_ISUPPER(*s)) {
				2575	s = Py_UNICODE_TOLOWER(s);
				2576	status = 1;
				2577	} else if (Py_UNICODE_ISLOWER(*s)) {
				2578	s = Py_UNICODE_TOUPPER(s);
				2579	status = 1;
				2580	}
				2581	s++;
				2582	}
				2583
				2584	return status;
				2585	}
				2586
				2587	static
				2588	int fixcapitalize(PyUnicodeObject *self)
				2589	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2590	int len = self->length;
				2591	Py_UNICODE *s = self->str;
				2592	int status = 0;
				2593
				2594	if (len == 0)
				2595	return 0;
				2596	if (Py_UNICODE_ISLOWER(*s)) {
				2597	s = Py_UNICODE_TOUPPER(s);
				2598	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2599	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2600	s++;
				2601	while (--len > 0) {
				2602	if (Py_UNICODE_ISUPPER(*s)) {
				2603	s = Py_UNICODE_TOLOWER(s);
				2604	status = 1;
				2605	}
				2606	s++;
				2607	}
				2608	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2609	}
				2610
				2611	static
				2612	int fixtitle(PyUnicodeObject *self)
				2613	{
				2614	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2615	register Py_UNICODE *e;
				2616	int previous_is_cased;
				2617
				2618	/* Shortcut for single character strings */
				2619	if (PyUnicode_GET_SIZE(self) == 1) {
				2620	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2621	if (*p != ch) {
				2622	*p = ch;
				2623	return 1;
				2624	}
				2625	else
				2626	return 0;
				2627	}
				2628
				2629	e = p + PyUnicode_GET_SIZE(self);
				2630	previous_is_cased = 0;
				2631	for (; p < e; p++) {
				2632	register const Py_UNICODE ch = *p;
				2633
				2634	if (previous_is_cased)
				2635	*p = Py_UNICODE_TOLOWER(ch);
				2636	else
				2637	*p = Py_UNICODE_TOTITLE(ch);
				2638
				2639	if (Py_UNICODE_ISLOWER(ch) \|\|
				2640	Py_UNICODE_ISUPPER(ch) \|\|
				2641	Py_UNICODE_ISTITLE(ch))
				2642	previous_is_cased = 1;
				2643	else
				2644	previous_is_cased = 0;
				2645	}
				2646	return 1;
				2647	}
				2648
				2649	PyObject PyUnicode_Join(PyObject separator,
				2650	PyObject *seq)
				2651	{
				2652	Py_UNICODE *sep;
				2653	int seplen;
				2654	PyUnicodeObject *res = NULL;
				2655	int reslen = 0;
				2656	Py_UNICODE *p;
				2657	int seqlen = 0;
				2658	int sz = 100;
				2659	int i;
				2660
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2661	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2662	if (seqlen < 0 && PyErr_Occurred())
				2663	return NULL;
				2664
				2665	if (separator == NULL) {
				2666	Py_UNICODE blank = ' ';
				2667	sep = &blank;
				2668	seplen = 1;
				2669	}
				2670	else {
				2671	separator = PyUnicode_FromObject(separator);
				2672	if (separator == NULL)
				2673	return NULL;
				2674	sep = PyUnicode_AS_UNICODE(separator);
				2675	seplen = PyUnicode_GET_SIZE(separator);
				2676	}
				2677
				2678	res = _PyUnicode_New(sz);
				2679	if (res == NULL)
				2680	goto onError;
				2681	p = PyUnicode_AS_UNICODE(res);
				2682	reslen = 0;
				2683
				2684	for (i = 0; i < seqlen; i++) {
				2685	int itemlen;
				2686	PyObject *item;
				2687
				2688	item = PySequence_GetItem(seq, i);
				2689	if (item == NULL)
				2690	goto onError;
				2691	if (!PyUnicode_Check(item)) {
				2692	PyObject *v;
				2693	v = PyUnicode_FromObject(item);
				2694	Py_DECREF(item);
				2695	item = v;
				2696	if (item == NULL)
				2697	goto onError;
				2698	}
				2699	itemlen = PyUnicode_GET_SIZE(item);
				2700	while (reslen + itemlen + seplen >= sz) {
				2701	if (_PyUnicode_Resize(res, sz*2))
				2702	goto onError;
				2703	sz *= 2;
				2704	p = PyUnicode_AS_UNICODE(res) + reslen;
				2705	}
				2706	if (i > 0) {
				2707	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2708	p += seplen;
				2709	reslen += seplen;
				2710	}
				2711	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2712	p += itemlen;
				2713	reslen += itemlen;
				2714	Py_DECREF(item);
				2715	}
				2716	if (_PyUnicode_Resize(res, reslen))
				2717	goto onError;
				2718
				2719	Py_XDECREF(separator);
				2720	return (PyObject *)res;
				2721
				2722	onError:
				2723	Py_XDECREF(separator);
				2724	Py_DECREF(res);
				2725	return NULL;
				2726	}
				2727
				2728	static
				2729	PyUnicodeObject pad(PyUnicodeObject self,
				2730	int left,
				2731	int right,
				2732	Py_UNICODE fill)
				2733	{
				2734	PyUnicodeObject *u;
				2735
				2736	if (left < 0)
				2737	left = 0;
				2738	if (right < 0)
				2739	right = 0;
				2740
				2741	if (left == 0 && right == 0) {
				2742	Py_INCREF(self);
				2743	return self;
				2744	}
				2745
				2746	u = _PyUnicode_New(left + self->length + right);
				2747	if (u) {
				2748	if (left)
				2749	Py_UNICODE_FILL(u->str, fill, left);
				2750	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2751	if (right)
				2752	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2753	}
				2754
				2755	return u;
				2756	}
				2757
				2758	#define SPLIT_APPEND(data, left, right) \
				2759	str = PyUnicode_FromUnicode(data + left, right - left); \
				2760	if (!str) \
				2761	goto onError; \
				2762	if (PyList_Append(list, str)) { \
				2763	Py_DECREF(str); \
				2764	goto onError; \
				2765	} \
				2766	else \
				2767	Py_DECREF(str);
				2768
				2769	static
				2770	PyObject split_whitespace(PyUnicodeObject self,
				2771	PyObject *list,
				2772	int maxcount)
				2773	{
				2774	register int i;
				2775	register int j;
				2776	int len = self->length;
				2777	PyObject *str;
				2778
				2779	for (i = j = 0; i < len; ) {
				2780	/* find a token */
				2781	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2782	i++;
				2783	j = i;
				2784	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2785	i++;
				2786	if (j < i) {
				2787	if (maxcount-- <= 0)
				2788	break;
				2789	SPLIT_APPEND(self->str, j, i);
				2790	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2791	i++;
				2792	j = i;
				2793	}
				2794	}
				2795	if (j < len) {
				2796	SPLIT_APPEND(self->str, j, len);
				2797	}
				2798	return list;
				2799
				2800	onError:
				2801	Py_DECREF(list);
				2802	return NULL;
				2803	}
				2804
				2805	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2806	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2807	{
				2808	register int i;
				2809	register int j;
				2810	int len;
				2811	PyObject *list;
				2812	PyObject *str;
				2813	Py_UNICODE *data;
				2814
				2815	string = PyUnicode_FromObject(string);
				2816	if (string == NULL)
				2817	return NULL;
				2818	data = PyUnicode_AS_UNICODE(string);
				2819	len = PyUnicode_GET_SIZE(string);
				2820
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2821	list = PyList_New(0);
				2822	if (!list)
				2823	goto onError;
				2824
				2825	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2826	int eol;
				2827
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2828	/* Find a line and append it */
				2829	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2830	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2831
				2832	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2833	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2834	if (i < len) {
				2835	if (data[i] == '\r' && i + 1 < len &&
				2836	data[i+1] == '\n')
				2837	i += 2;
				2838	else
				2839	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2840	if (keepends)
				2841	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2842	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2843	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2844	j = i;
				2845	}
				2846	if (j < len) {
				2847	SPLIT_APPEND(data, j, len);
				2848	}
				2849
				2850	Py_DECREF(string);
				2851	return list;
				2852
				2853	onError:
				2854	Py_DECREF(list);
				2855	Py_DECREF(string);
				2856	return NULL;
				2857	}
				2858
				2859	static
				2860	PyObject split_char(PyUnicodeObject self,
				2861	PyObject *list,
				2862	Py_UNICODE ch,
				2863	int maxcount)
				2864	{
				2865	register int i;
				2866	register int j;
				2867	int len = self->length;
				2868	PyObject *str;
				2869
				2870	for (i = j = 0; i < len; ) {
				2871	if (self->str[i] == ch) {
				2872	if (maxcount-- <= 0)
				2873	break;
				2874	SPLIT_APPEND(self->str, j, i);
				2875	i = j = i + 1;
				2876	} else
				2877	i++;
				2878	}
				2879	if (j <= len) {
				2880	SPLIT_APPEND(self->str, j, len);
				2881	}
				2882	return list;
				2883
				2884	onError:
				2885	Py_DECREF(list);
				2886	return NULL;
				2887	}
				2888
				2889	static
				2890	PyObject split_substring(PyUnicodeObject self,
				2891	PyObject *list,
				2892	PyUnicodeObject *substring,
				2893	int maxcount)
				2894	{
				2895	register int i;
				2896	register int j;
				2897	int len = self->length;
				2898	int sublen = substring->length;
				2899	PyObject *str;
				2900
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2901	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2902	if (Py_UNICODE_MATCH(self, i, substring)) {
				2903	if (maxcount-- <= 0)
				2904	break;
				2905	SPLIT_APPEND(self->str, j, i);
				2906	i = j = i + sublen;
				2907	} else
				2908	i++;
				2909	}
				2910	if (j <= len) {
				2911	SPLIT_APPEND(self->str, j, len);
				2912	}
				2913	return list;
				2914
				2915	onError:
				2916	Py_DECREF(list);
				2917	return NULL;
				2918	}
				2919
				2920	#undef SPLIT_APPEND
				2921
				2922	static
				2923	PyObject split(PyUnicodeObject self,
				2924	PyUnicodeObject *substring,
				2925	int maxcount)
				2926	{
				2927	PyObject *list;
				2928
				2929	if (maxcount < 0)
				2930	maxcount = INT_MAX;
				2931
				2932	list = PyList_New(0);
				2933	if (!list)
				2934	return NULL;
				2935
				2936	if (substring == NULL)
				2937	return split_whitespace(self,list,maxcount);
				2938
				2939	else if (substring->length == 1)
				2940	return split_char(self,list,substring->str[0],maxcount);
				2941
				2942	else if (substring->length == 0) {
				2943	Py_DECREF(list);
				2944	PyErr_SetString(PyExc_ValueError, "empty separator");
				2945	return NULL;
				2946	}
				2947	else
				2948	return split_substring(self,list,substring,maxcount);
				2949	}
				2950
				2951	static
				2952	PyObject strip(PyUnicodeObject self,
				2953	int left,
				2954	int right)
				2955	{
				2956	Py_UNICODE *p = self->str;
				2957	int start = 0;
				2958	int end = self->length;
				2959
				2960	if (left)
				2961	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2962	start++;
				2963
				2964	if (right)
				2965	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2966	end--;
				2967
				2968	if (start == 0 && end == self->length) {
				2969	/* couldn't strip anything off, return original string */
				2970	Py_INCREF(self);
				2971	return (PyObject*) self;
				2972	}
				2973
				2974	return (PyObject*) PyUnicode_FromUnicode(
				2975	self->str + start,
				2976	end - start
				2977	);
				2978	}
				2979
				2980	static
				2981	PyObject replace(PyUnicodeObject self,
				2982	PyUnicodeObject *str1,
				2983	PyUnicodeObject *str2,
				2984	int maxcount)
				2985	{
				2986	PyUnicodeObject *u;
				2987
				2988	if (maxcount < 0)
				2989	maxcount = INT_MAX;
				2990
				2991	if (str1->length == 1 && str2->length == 1) {
				2992	int i;
				2993
				2994	/* replace characters */
				2995	if (!findchar(self->str, self->length, str1->str[0])) {
				2996	/* nothing to replace, return original string */
				2997	Py_INCREF(self);
				2998	u = self;
				2999	} else {
				3000	Py_UNICODE u1 = str1->str[0];
				3001	Py_UNICODE u2 = str2->str[0];
				3002
				3003	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3004	self->str,
				3005	self->length
				3006	);
				3007	if (u)
				3008	for (i = 0; i < u->length; i++)
				3009	if (u->str[i] == u1) {
				3010	if (--maxcount < 0)
				3011	break;
				3012	u->str[i] = u2;
				3013	}
				3014	}
				3015
				3016	} else {
				3017	int n, i;
				3018	Py_UNICODE *p;
				3019
				3020	/* replace strings */
				3021	n = count(self, 0, self->length, str1);
				3022	if (n > maxcount)
				3023	n = maxcount;
				3024	if (n == 0) {
				3025	/* nothing to replace, return original string */
				3026	Py_INCREF(self);
				3027	u = self;
				3028	} else {
				3029	u = _PyUnicode_New(
				3030	self->length + n * (str2->length - str1->length));
				3031	if (u) {
				3032	i = 0;
				3033	p = u->str;
				3034	while (i <= self->length - str1->length)
				3035	if (Py_UNICODE_MATCH(self, i, str1)) {
				3036	/* replace string segment */
				3037	Py_UNICODE_COPY(p, str2->str, str2->length);
				3038	p += str2->length;
				3039	i += str1->length;
				3040	if (--n <= 0) {
				3041	/* copy remaining part */
				3042	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3043	break;
				3044	}
				3045	} else
				3046	*p++ = self->str[i++];
				3047	}
				3048	}
				3049	}
				3050
				3051	return (PyObject *) u;
				3052	}
				3053
				3054	/* --- Unicode Object Methods --------------------------------------------- */
				3055
				3056	static char title__doc__[] =
				3057	"S.title() -> unicode\n\
				3058	\n\
				3059	Return a titlecased version of S, i.e. words start with title case\n\
				3060	characters, all remaining cased characters have lower case.";
				3061
				3062	static PyObject*
				3063	unicode_title(PyUnicodeObject self, PyObject args)
				3064	{
				3065	if (!PyArg_NoArgs(args))
				3066	return NULL;
				3067	return fixup(self, fixtitle);
				3068	}
				3069
				3070	static char capitalize__doc__[] =
				3071	"S.capitalize() -> unicode\n\
				3072	\n\
				3073	Return a capitalized version of S, i.e. make the first character\n\
				3074	have upper case.";
				3075
				3076	static PyObject*
				3077	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3078	{
				3079	if (!PyArg_NoArgs(args))
				3080	return NULL;
				3081	return fixup(self, fixcapitalize);
				3082	}
				3083
				3084	#if 0
				3085	static char capwords__doc__[] =
				3086	"S.capwords() -> unicode\n\
				3087	\n\
				3088	Apply .capitalize() to all words in S and return the result with\n\
				3089	normalized whitespace (all whitespace strings are replaced by ' ').";
				3090
				3091	static PyObject*
				3092	unicode_capwords(PyUnicodeObject self, PyObject args)
				3093	{
				3094	PyObject *list;
				3095	PyObject *item;
				3096	int i;
				3097
				3098	if (!PyArg_NoArgs(args))
				3099	return NULL;
				3100
				3101	/* Split into words */
				3102	list = split(self, NULL, -1);
				3103	if (!list)
				3104	return NULL;
				3105
				3106	/* Capitalize each word */
				3107	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3108	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3109	fixcapitalize);
				3110	if (item == NULL)
				3111	goto onError;
				3112	Py_DECREF(PyList_GET_ITEM(list, i));
				3113	PyList_SET_ITEM(list, i, item);
				3114	}
				3115
				3116	/* Join the words to form a new string */
				3117	item = PyUnicode_Join(NULL, list);
				3118
				3119	onError:
				3120	Py_DECREF(list);
				3121	return (PyObject *)item;
				3122	}
				3123	#endif
				3124
				3125	static char center__doc__[] =
				3126	"S.center(width) -> unicode\n\
				3127	\n\
				3128	Return S centered in a Unicode string of length width. Padding is done\n\
				3129	using spaces.";
				3130
				3131	static PyObject *
				3132	unicode_center(PyUnicodeObject self, PyObject args)
				3133	{
				3134	int marg, left;
				3135	int width;
				3136
				3137	if (!PyArg_ParseTuple(args, "i:center", &width))
				3138	return NULL;
				3139
				3140	if (self->length >= width) {
				3141	Py_INCREF(self);
				3142	return (PyObject*) self;
				3143	}
				3144
				3145	marg = width - self->length;
				3146	left = marg / 2 + (marg & width & 1);
				3147
				3148	return (PyObject*) pad(self, left, marg - left, ' ');
				3149	}
				3150
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3151	#if 0
				3152
				3153	/* This code should go into some future Unicode collation support
				3154	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3155	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3156
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3157	/* speedy UTF-16 code point order comparison */
				3158	/* gleaned from: */
				3159	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3160
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3161	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3162	{
				3163	0, 0, 0, 0, 0, 0, 0, 0,
				3164	0, 0, 0, 0, 0, 0, 0, 0,
				3165	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3166	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3167	};
				3168
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3169	static int
				3170	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3171	{
				3172	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3173
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3174	Py_UNICODE *s1 = str1->str;
				3175	Py_UNICODE *s2 = str2->str;
				3176
				3177	len1 = str1->length;
				3178	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3179
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3180	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3181	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3182	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3183
				3184	c1 = *s1++;
				3185	c2 = *s2++;
				3186	if (c1 > (1<<11) * 26)
				3187	c1 += utf16Fixup[c1>>11];
				3188	if (c2 > (1<<11) * 26)
				3189	c2 += utf16Fixup[c2>>11];
				3190
				3191	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3192	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3193	if (diff)
				3194	return (diff < 0) ? -1 : (diff != 0);
				3195	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3196	}
				3197
				3198	return (len1 < len2) ? -1 : (len1 != len2);
				3199	}
				3200
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3201	#else
				3202
				3203	static int
				3204	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3205	{
				3206	register int len1, len2;
				3207
				3208	Py_UNICODE *s1 = str1->str;
				3209	Py_UNICODE *s2 = str2->str;
				3210
				3211	len1 = str1->length;
				3212	len2 = str2->length;
				3213
				3214	while (len1 > 0 && len2 > 0) {
				3215	register long diff;
				3216
				3217	diff = (long)s1++ - (long)s2++;
				3218	if (diff)
				3219	return (diff < 0) ? -1 : (diff != 0);
				3220	len1--; len2--;
				3221	}
				3222
				3223	return (len1 < len2) ? -1 : (len1 != len2);
				3224	}
				3225
				3226	#endif
				3227
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3228	int PyUnicode_Compare(PyObject *left,
				3229	PyObject *right)
				3230	{
				3231	PyUnicodeObject u = NULL, v = NULL;
				3232	int result;
				3233
				3234	/* Coerce the two arguments */
				3235	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3236	if (u == NULL)
				3237	goto onError;
				3238	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3239	if (v == NULL)
				3240	goto onError;
				3241
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3242	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3243	if (v == u) {
				3244	Py_DECREF(u);
				3245	Py_DECREF(v);
				3246	return 0;
				3247	}
				3248
				3249	result = unicode_compare(u, v);
				3250
				3251	Py_DECREF(u);
				3252	Py_DECREF(v);
				3253	return result;
				3254
				3255	onError:
				3256	Py_XDECREF(u);
				3257	Py_XDECREF(v);
				3258	return -1;
				3259	}
				3260
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3261	int PyUnicode_Contains(PyObject *container,
				3262	PyObject *element)
				3263	{
				3264	PyUnicodeObject u = NULL, v = NULL;
				3265	int result;
				3266	register const Py_UNICODE p, e;
				3267	register Py_UNICODE ch;
				3268
				3269	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3270	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3271	if (v == NULL) {
				3272	PyErr_SetString(PyExc_TypeError,
				3273	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3274	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3275	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3276	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3277	if (u == NULL) {
				3278	Py_DECREF(v);
				3279	goto onError;
				3280	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3281
				3282	/* Check v in u */
				3283	if (PyUnicode_GET_SIZE(v) != 1) {
				3284	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3285	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3286	goto onError;
				3287	}
				3288	ch = *PyUnicode_AS_UNICODE(v);
				3289	p = PyUnicode_AS_UNICODE(u);
				3290	e = p + PyUnicode_GET_SIZE(u);
				3291	result = 0;
				3292	while (p < e) {
				3293	if (*p++ == ch) {
				3294	result = 1;
				3295	break;
				3296	}
				3297	}
				3298
				3299	Py_DECREF(u);
				3300	Py_DECREF(v);
				3301	return result;
				3302
				3303	onError:
				3304	Py_XDECREF(u);
				3305	Py_XDECREF(v);
				3306	return -1;
				3307	}
				3308
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3309	/* Concat to string or Unicode object giving a new Unicode object. */
				3310
				3311	PyObject PyUnicode_Concat(PyObject left,
				3312	PyObject *right)
				3313	{
				3314	PyUnicodeObject u = NULL, v = NULL, *w;
				3315
				3316	/* Coerce the two arguments */
				3317	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3318	if (u == NULL)
				3319	goto onError;
				3320	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3321	if (v == NULL)
				3322	goto onError;
				3323
				3324	/* Shortcuts */
				3325	if (v == unicode_empty) {
				3326	Py_DECREF(v);
				3327	return (PyObject *)u;
				3328	}
				3329	if (u == unicode_empty) {
				3330	Py_DECREF(u);
				3331	return (PyObject *)v;
				3332	}
				3333
				3334	/* Concat the two Unicode strings */
				3335	w = _PyUnicode_New(u->length + v->length);
				3336	if (w == NULL)
				3337	goto onError;
				3338	Py_UNICODE_COPY(w->str, u->str, u->length);
				3339	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3340
				3341	Py_DECREF(u);
				3342	Py_DECREF(v);
				3343	return (PyObject *)w;
				3344
				3345	onError:
				3346	Py_XDECREF(u);
				3347	Py_XDECREF(v);
				3348	return NULL;
				3349	}
				3350
				3351	static char count__doc__[] =
				3352	"S.count(sub[, start[, end]]) -> int\n\
				3353	\n\
				3354	Return the number of occurrences of substring sub in Unicode string\n\
				3355	S[start:end]. Optional arguments start and end are\n\
				3356	interpreted as in slice notation.";
				3357
				3358	static PyObject *
				3359	unicode_count(PyUnicodeObject self, PyObject args)
				3360	{
				3361	PyUnicodeObject *substring;
				3362	int start = 0;
				3363	int end = INT_MAX;
				3364	PyObject *result;
				3365
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3366	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3367	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3368	return NULL;
				3369
				3370	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3371	(PyObject *)substring);
				3372	if (substring == NULL)
				3373	return NULL;
				3374
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3375	if (start < 0)
				3376	start += self->length;
				3377	if (start < 0)
				3378	start = 0;
				3379	if (end > self->length)
				3380	end = self->length;
				3381	if (end < 0)
				3382	end += self->length;
				3383	if (end < 0)
				3384	end = 0;
				3385
				3386	result = PyInt_FromLong((long) count(self, start, end, substring));
				3387
				3388	Py_DECREF(substring);
				3389	return result;
				3390	}
				3391
				3392	static char encode__doc__[] =
				3393	"S.encode([encoding[,errors]]) -> string\n\
				3394	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3395	Return an encoded string version of S. Default encoding is the current\n\
				3396	default string encoding. errors may be given to set a different error\n\
				3397	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3398	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3399
				3400	static PyObject *
				3401	unicode_encode(PyUnicodeObject self, PyObject args)
				3402	{
				3403	char *encoding = NULL;
				3404	char *errors = NULL;
				3405	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3406	return NULL;
				3407	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3408	}
				3409
				3410	static char expandtabs__doc__[] =
				3411	"S.expandtabs([tabsize]) -> unicode\n\
				3412	\n\
				3413	Return a copy of S where all tab characters are expanded using spaces.\n\
				3414	If tabsize is not given, a tab size of 8 characters is assumed.";
				3415
				3416	static PyObject*
				3417	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3418	{
				3419	Py_UNICODE *e;
				3420	Py_UNICODE *p;
				3421	Py_UNICODE *q;
				3422	int i, j;
				3423	PyUnicodeObject *u;
				3424	int tabsize = 8;
				3425
				3426	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3427	return NULL;
				3428
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3429	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3430	i = j = 0;
				3431	e = self->str + self->length;
				3432	for (p = self->str; p < e; p++)
				3433	if (*p == '\t') {
				3434	if (tabsize > 0)
				3435	j += tabsize - (j % tabsize);
				3436	}
				3437	else {
				3438	j++;
				3439	if (p == '\n' \|\| p == '\r') {
				3440	i += j;
				3441	j = 0;
				3442	}
				3443	}
				3444
				3445	/* Second pass: create output string and fill it */
				3446	u = _PyUnicode_New(i + j);
				3447	if (!u)
				3448	return NULL;
				3449
				3450	j = 0;
				3451	q = u->str;
				3452
				3453	for (p = self->str; p < e; p++)
				3454	if (*p == '\t') {
				3455	if (tabsize > 0) {
				3456	i = tabsize - (j % tabsize);
				3457	j += i;
				3458	while (i--)
				3459	*q++ = ' ';
				3460	}
				3461	}
				3462	else {
				3463	j++;
				3464	q++ = p;
				3465	if (p == '\n' \|\| p == '\r')
				3466	j = 0;
				3467	}
				3468
				3469	return (PyObject*) u;
				3470	}
				3471
				3472	static char find__doc__[] =
				3473	"S.find(sub [,start [,end]]) -> int\n\
				3474	\n\
				3475	Return the lowest index in S where substring sub is found,\n\
				3476	such that sub is contained within s[start,end]. Optional\n\
				3477	arguments start and end are interpreted as in slice notation.\n\
				3478	\n\
				3479	Return -1 on failure.";
				3480
				3481	static PyObject *
				3482	unicode_find(PyUnicodeObject self, PyObject args)
				3483	{
				3484	PyUnicodeObject *substring;
				3485	int start = 0;
				3486	int end = INT_MAX;
				3487	PyObject *result;
				3488
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3489	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3490	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3491	return NULL;
				3492	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3493	(PyObject *)substring);
				3494	if (substring == NULL)
				3495	return NULL;
				3496
				3497	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3498
				3499	Py_DECREF(substring);
				3500	return result;
				3501	}
				3502
				3503	static PyObject *
				3504	unicode_getitem(PyUnicodeObject *self, int index)
				3505	{
				3506	if (index < 0 \|\| index >= self->length) {
				3507	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3508	return NULL;
				3509	}
				3510
				3511	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3512	}
				3513
				3514	static long
				3515	unicode_hash(PyUnicodeObject *self)
				3516	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3517	/* Since Unicode objects compare equal to their ASCII string
				3518	counterparts, they should use the individual character values
				3519	as basis for their hash value. This is needed to assure that
				3520	strings and Unicode objects behave in the same way as
				3521	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3522
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3523	register int len;
				3524	register Py_UNICODE *p;
				3525	register long x;
				3526
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3527	if (self->hash != -1)
				3528	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3529	len = PyUnicode_GET_SIZE(self);
				3530	p = PyUnicode_AS_UNICODE(self);
				3531	x = *p << 7;
				3532	while (--len >= 0)
				3533	x = (1000003x) ^ p++;
				3534	x ^= PyUnicode_GET_SIZE(self);
				3535	if (x == -1)
				3536	x = -2;
				3537	self->hash = x;
				3538	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3539	}
				3540
				3541	static char index__doc__[] =
				3542	"S.index(sub [,start [,end]]) -> int\n\
				3543	\n\
				3544	Like S.find() but raise ValueError when the substring is not found.";
				3545
				3546	static PyObject *
				3547	unicode_index(PyUnicodeObject self, PyObject args)
				3548	{
				3549	int result;
				3550	PyUnicodeObject *substring;
				3551	int start = 0;
				3552	int end = INT_MAX;
				3553
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3554	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3555	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3556	return NULL;
				3557
				3558	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3559	(PyObject *)substring);
				3560	if (substring == NULL)
				3561	return NULL;
				3562
				3563	result = findstring(self, substring, start, end, 1);
				3564
				3565	Py_DECREF(substring);
				3566	if (result < 0) {
				3567	PyErr_SetString(PyExc_ValueError, "substring not found");
				3568	return NULL;
				3569	}
				3570	return PyInt_FromLong(result);
				3571	}
				3572
				3573	static char islower__doc__[] =
				3574	"S.islower() -> int\n\
				3575	\n\
				3576	Return 1 if all cased characters in S are lowercase and there is\n\
				3577	at least one cased character in S, 0 otherwise.";
				3578
				3579	static PyObject*
				3580	unicode_islower(PyUnicodeObject self, PyObject args)
				3581	{
				3582	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3583	register const Py_UNICODE *e;
				3584	int cased;
				3585
				3586	if (!PyArg_NoArgs(args))
				3587	return NULL;
				3588
				3589	/* Shortcut for single character strings */
				3590	if (PyUnicode_GET_SIZE(self) == 1)
				3591	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3592
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3593	/* Special case for empty strings */
				3594	if (PyString_GET_SIZE(self) == 0)
				3595	return PyInt_FromLong(0);
				3596
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3597	e = p + PyUnicode_GET_SIZE(self);
				3598	cased = 0;
				3599	for (; p < e; p++) {
				3600	register const Py_UNICODE ch = *p;
				3601
				3602	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3603	return PyInt_FromLong(0);
				3604	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3605	cased = 1;
				3606	}
				3607	return PyInt_FromLong(cased);
				3608	}
				3609
				3610	static char isupper__doc__[] =
				3611	"S.isupper() -> int\n\
				3612	\n\
				3613	Return 1 if all cased characters in S are uppercase and there is\n\
				3614	at least one cased character in S, 0 otherwise.";
				3615
				3616	static PyObject*
				3617	unicode_isupper(PyUnicodeObject self, PyObject args)
				3618	{
				3619	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3620	register const Py_UNICODE *e;
				3621	int cased;
				3622
				3623	if (!PyArg_NoArgs(args))
				3624	return NULL;
				3625
				3626	/* Shortcut for single character strings */
				3627	if (PyUnicode_GET_SIZE(self) == 1)
				3628	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3629
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3630	/* Special case for empty strings */
				3631	if (PyString_GET_SIZE(self) == 0)
				3632	return PyInt_FromLong(0);
				3633
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3634	e = p + PyUnicode_GET_SIZE(self);
				3635	cased = 0;
				3636	for (; p < e; p++) {
				3637	register const Py_UNICODE ch = *p;
				3638
				3639	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3640	return PyInt_FromLong(0);
				3641	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3642	cased = 1;
				3643	}
				3644	return PyInt_FromLong(cased);
				3645	}
				3646
				3647	static char istitle__doc__[] =
				3648	"S.istitle() -> int\n\
				3649	\n\
				3650	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3651	may only follow uncased characters and lowercase characters only cased\n\
				3652	ones. Return 0 otherwise.";
				3653
				3654	static PyObject*
				3655	unicode_istitle(PyUnicodeObject self, PyObject args)
				3656	{
				3657	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3658	register const Py_UNICODE *e;
				3659	int cased, previous_is_cased;
				3660
				3661	if (!PyArg_NoArgs(args))
				3662	return NULL;
				3663
				3664	/* Shortcut for single character strings */
				3665	if (PyUnicode_GET_SIZE(self) == 1)
				3666	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3667	(Py_UNICODE_ISUPPER(*p) != 0));
				3668
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3669	/* Special case for empty strings */
				3670	if (PyString_GET_SIZE(self) == 0)
				3671	return PyInt_FromLong(0);
				3672
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3673	e = p + PyUnicode_GET_SIZE(self);
				3674	cased = 0;
				3675	previous_is_cased = 0;
				3676	for (; p < e; p++) {
				3677	register const Py_UNICODE ch = *p;
				3678
				3679	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3680	if (previous_is_cased)
				3681	return PyInt_FromLong(0);
				3682	previous_is_cased = 1;
				3683	cased = 1;
				3684	}
				3685	else if (Py_UNICODE_ISLOWER(ch)) {
				3686	if (!previous_is_cased)
				3687	return PyInt_FromLong(0);
				3688	previous_is_cased = 1;
				3689	cased = 1;
				3690	}
				3691	else
				3692	previous_is_cased = 0;
				3693	}
				3694	return PyInt_FromLong(cased);
				3695	}
				3696
				3697	static char isspace__doc__[] =
				3698	"S.isspace() -> int\n\
				3699	\n\
				3700	Return 1 if there are only whitespace characters in S,\n\
				3701	0 otherwise.";
				3702
				3703	static PyObject*
				3704	unicode_isspace(PyUnicodeObject self, PyObject args)
				3705	{
				3706	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3707	register const Py_UNICODE *e;
				3708
				3709	if (!PyArg_NoArgs(args))
				3710	return NULL;
				3711
				3712	/* Shortcut for single character strings */
				3713	if (PyUnicode_GET_SIZE(self) == 1 &&
				3714	Py_UNICODE_ISSPACE(*p))
				3715	return PyInt_FromLong(1);
				3716
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3717	/* Special case for empty strings */
				3718	if (PyString_GET_SIZE(self) == 0)
				3719	return PyInt_FromLong(0);
				3720
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3721	e = p + PyUnicode_GET_SIZE(self);
				3722	for (; p < e; p++) {
				3723	if (!Py_UNICODE_ISSPACE(*p))
				3724	return PyInt_FromLong(0);
				3725	}
				3726	return PyInt_FromLong(1);
				3727	}
				3728
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3729	static char isalpha__doc__[] =
				3730	"S.isalpha() -> int\n\
				3731	\n\
				3732	Return 1 if all characters in S are alphabetic\n\
				3733	and there is at least one character in S, 0 otherwise.";
				3734
				3735	static PyObject*
				3736	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3737	{
				3738	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3739	register const Py_UNICODE *e;
				3740
				3741	if (!PyArg_NoArgs(args))
				3742	return NULL;
				3743
				3744	/* Shortcut for single character strings */
				3745	if (PyUnicode_GET_SIZE(self) == 1 &&
				3746	Py_UNICODE_ISALPHA(*p))
				3747	return PyInt_FromLong(1);
				3748
				3749	/* Special case for empty strings */
				3750	if (PyString_GET_SIZE(self) == 0)
				3751	return PyInt_FromLong(0);
				3752
				3753	e = p + PyUnicode_GET_SIZE(self);
				3754	for (; p < e; p++) {
				3755	if (!Py_UNICODE_ISALPHA(*p))
				3756	return PyInt_FromLong(0);
				3757	}
				3758	return PyInt_FromLong(1);
				3759	}
				3760
				3761	static char isalnum__doc__[] =
				3762	"S.isalnum() -> int\n\
				3763	\n\
				3764	Return 1 if all characters in S are alphanumeric\n\
				3765	and there is at least one character in S, 0 otherwise.";
				3766
				3767	static PyObject*
				3768	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3769	{
				3770	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3771	register const Py_UNICODE *e;
				3772
				3773	if (!PyArg_NoArgs(args))
				3774	return NULL;
				3775
				3776	/* Shortcut for single character strings */
				3777	if (PyUnicode_GET_SIZE(self) == 1 &&
				3778	Py_UNICODE_ISALNUM(*p))
				3779	return PyInt_FromLong(1);
				3780
				3781	/* Special case for empty strings */
				3782	if (PyString_GET_SIZE(self) == 0)
				3783	return PyInt_FromLong(0);
				3784
				3785	e = p + PyUnicode_GET_SIZE(self);
				3786	for (; p < e; p++) {
				3787	if (!Py_UNICODE_ISALNUM(*p))
				3788	return PyInt_FromLong(0);
				3789	}
				3790	return PyInt_FromLong(1);
				3791	}
				3792
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3793	static char isdecimal__doc__[] =
				3794	"S.isdecimal() -> int\n\
				3795	\n\
				3796	Return 1 if there are only decimal characters in S,\n\
				3797	0 otherwise.";
				3798
				3799	static PyObject*
				3800	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3801	{
				3802	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3803	register const Py_UNICODE *e;
				3804
				3805	if (!PyArg_NoArgs(args))
				3806	return NULL;
				3807
				3808	/* Shortcut for single character strings */
				3809	if (PyUnicode_GET_SIZE(self) == 1 &&
				3810	Py_UNICODE_ISDECIMAL(*p))
				3811	return PyInt_FromLong(1);
				3812
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3813	/* Special case for empty strings */
				3814	if (PyString_GET_SIZE(self) == 0)
				3815	return PyInt_FromLong(0);
				3816
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3817	e = p + PyUnicode_GET_SIZE(self);
				3818	for (; p < e; p++) {
				3819	if (!Py_UNICODE_ISDECIMAL(*p))
				3820	return PyInt_FromLong(0);
				3821	}
				3822	return PyInt_FromLong(1);
				3823	}
				3824
				3825	static char isdigit__doc__[] =
				3826	"S.isdigit() -> int\n\
				3827	\n\
				3828	Return 1 if there are only digit characters in S,\n\
				3829	0 otherwise.";
				3830
				3831	static PyObject*
				3832	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3833	{
				3834	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3835	register const Py_UNICODE *e;
				3836
				3837	if (!PyArg_NoArgs(args))
				3838	return NULL;
				3839
				3840	/* Shortcut for single character strings */
				3841	if (PyUnicode_GET_SIZE(self) == 1 &&
				3842	Py_UNICODE_ISDIGIT(*p))
				3843	return PyInt_FromLong(1);
				3844
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3845	/* Special case for empty strings */
				3846	if (PyString_GET_SIZE(self) == 0)
				3847	return PyInt_FromLong(0);
				3848
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3849	e = p + PyUnicode_GET_SIZE(self);
				3850	for (; p < e; p++) {
				3851	if (!Py_UNICODE_ISDIGIT(*p))
				3852	return PyInt_FromLong(0);
				3853	}
				3854	return PyInt_FromLong(1);
				3855	}
				3856
				3857	static char isnumeric__doc__[] =
				3858	"S.isnumeric() -> int\n\
				3859	\n\
				3860	Return 1 if there are only numeric characters in S,\n\
				3861	0 otherwise.";
				3862
				3863	static PyObject*
				3864	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3865	{
				3866	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3867	register const Py_UNICODE *e;
				3868
				3869	if (!PyArg_NoArgs(args))
				3870	return NULL;
				3871
				3872	/* Shortcut for single character strings */
				3873	if (PyUnicode_GET_SIZE(self) == 1 &&
				3874	Py_UNICODE_ISNUMERIC(*p))
				3875	return PyInt_FromLong(1);
				3876
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3877	/* Special case for empty strings */
				3878	if (PyString_GET_SIZE(self) == 0)
				3879	return PyInt_FromLong(0);
				3880
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3881	e = p + PyUnicode_GET_SIZE(self);
				3882	for (; p < e; p++) {
				3883	if (!Py_UNICODE_ISNUMERIC(*p))
				3884	return PyInt_FromLong(0);
				3885	}
				3886	return PyInt_FromLong(1);
				3887	}
				3888
				3889	static char join__doc__[] =
				3890	"S.join(sequence) -> unicode\n\
				3891	\n\
				3892	Return a string which is the concatenation of the strings in the\n\
				3893	sequence. The separator between elements is S.";
				3894
				3895	static PyObject*
				3896	unicode_join(PyUnicodeObject self, PyObject args)
				3897	{
				3898	PyObject *data;
				3899	if (!PyArg_ParseTuple(args, "O:join", &data))
				3900	return NULL;
				3901
				3902	return PyUnicode_Join((PyObject *)self, data);
				3903	}
				3904
				3905	static int
				3906	unicode_length(PyUnicodeObject *self)
				3907	{
				3908	return self->length;
				3909	}
				3910
				3911	static char ljust__doc__[] =
				3912	"S.ljust(width) -> unicode\n\
				3913	\n\
				3914	Return S left justified in a Unicode string of length width. Padding is\n\
				3915	done using spaces.";
				3916
				3917	static PyObject *
				3918	unicode_ljust(PyUnicodeObject self, PyObject args)
				3919	{
				3920	int width;
				3921	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3922	return NULL;
				3923
				3924	if (self->length >= width) {
				3925	Py_INCREF(self);
				3926	return (PyObject*) self;
				3927	}
				3928
				3929	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3930	}
				3931
				3932	static char lower__doc__[] =
				3933	"S.lower() -> unicode\n\
				3934	\n\
				3935	Return a copy of the string S converted to lowercase.";
				3936
				3937	static PyObject*
				3938	unicode_lower(PyUnicodeObject self, PyObject args)
				3939	{
				3940	if (!PyArg_NoArgs(args))
				3941	return NULL;
				3942	return fixup(self, fixlower);
				3943	}
				3944
				3945	static char lstrip__doc__[] =
				3946	"S.lstrip() -> unicode\n\
				3947	\n\
				3948	Return a copy of the string S with leading whitespace removed.";
				3949
				3950	static PyObject *
				3951	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3952	{
				3953	if (!PyArg_NoArgs(args))
				3954	return NULL;
				3955	return strip(self, 1, 0);
				3956	}
				3957
				3958	static PyObject*
				3959	unicode_repeat(PyUnicodeObject *str, int len)
				3960	{
				3961	PyUnicodeObject *u;
				3962	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3963	int nchars;
				3964	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3965
				3966	if (len < 0)
				3967	len = 0;
				3968
				3969	if (len == 1) {
				3970	/* no repeat, return original string */
				3971	Py_INCREF(str);
				3972	return (PyObject*) str;
				3973	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3974
				3975	/* ensure # of chars needed doesn't overflow int and # of bytes
				3976	* needed doesn't overflow size_t
				3977	*/
				3978	nchars = len * str->length;
				3979	if (len && nchars / len != str->length) {
				3980	PyErr_SetString(PyExc_OverflowError,
				3981	"repeated string is too long");
				3982	return NULL;
				3983	}
				3984	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				3985	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				3986	PyErr_SetString(PyExc_OverflowError,
				3987	"repeated string is too long");
				3988	return NULL;
				3989	}
				3990	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3991	if (!u)
				3992	return NULL;
				3993
				3994	p = u->str;
				3995
				3996	while (len-- > 0) {
				3997	Py_UNICODE_COPY(p, str->str, str->length);
				3998	p += str->length;
				3999	}
				4000
				4001	return (PyObject*) u;
				4002	}
				4003
				4004	PyObject PyUnicode_Replace(PyObject obj,
				4005	PyObject *subobj,
				4006	PyObject *replobj,
				4007	int maxcount)
				4008	{
				4009	PyObject *self;
				4010	PyObject *str1;
				4011	PyObject *str2;
				4012	PyObject *result;
				4013
				4014	self = PyUnicode_FromObject(obj);
				4015	if (self == NULL)
				4016	return NULL;
				4017	str1 = PyUnicode_FromObject(subobj);
				4018	if (str1 == NULL) {
				4019	Py_DECREF(self);
				4020	return NULL;
				4021	}
				4022	str2 = PyUnicode_FromObject(replobj);
				4023	if (str2 == NULL) {
				4024	Py_DECREF(self);
				4025	Py_DECREF(str1);
				4026	return NULL;
				4027	}
				4028	result = replace((PyUnicodeObject *)self,
				4029	(PyUnicodeObject *)str1,
				4030	(PyUnicodeObject *)str2,
				4031	maxcount);
				4032	Py_DECREF(self);
				4033	Py_DECREF(str1);
				4034	Py_DECREF(str2);
				4035	return result;
				4036	}
				4037
				4038	static char replace__doc__[] =
				4039	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4040	\n\
				4041	Return a copy of S with all occurrences of substring\n\
				4042	old replaced by new. If the optional argument maxsplit is\n\
				4043	given, only the first maxsplit occurrences are replaced.";
				4044
				4045	static PyObject*
				4046	unicode_replace(PyUnicodeObject self, PyObject args)
				4047	{
				4048	PyUnicodeObject *str1;
				4049	PyUnicodeObject *str2;
				4050	int maxcount = -1;
				4051	PyObject *result;
				4052
				4053	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4054	return NULL;
				4055	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4056	if (str1 == NULL)
				4057	return NULL;
				4058	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4059	if (str2 == NULL)
				4060	return NULL;
				4061
				4062	result = replace(self, str1, str2, maxcount);
				4063
				4064	Py_DECREF(str1);
				4065	Py_DECREF(str2);
				4066	return result;
				4067	}
				4068
				4069	static
				4070	PyObject unicode_repr(PyObject unicode)
				4071	{
				4072	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4073	PyUnicode_GET_SIZE(unicode),
				4074	1);
				4075	}
				4076
				4077	static char rfind__doc__[] =
				4078	"S.rfind(sub [,start [,end]]) -> int\n\
				4079	\n\
				4080	Return the highest index in S where substring sub is found,\n\
				4081	such that sub is contained within s[start,end]. Optional\n\
				4082	arguments start and end are interpreted as in slice notation.\n\
				4083	\n\
				4084	Return -1 on failure.";
				4085
				4086	static PyObject *
				4087	unicode_rfind(PyUnicodeObject self, PyObject args)
				4088	{
				4089	PyUnicodeObject *substring;
				4090	int start = 0;
				4091	int end = INT_MAX;
				4092	PyObject *result;
				4093
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4094	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4095	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4096	return NULL;
				4097	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4098	(PyObject *)substring);
				4099	if (substring == NULL)
				4100	return NULL;
				4101
				4102	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4103
				4104	Py_DECREF(substring);
				4105	return result;
				4106	}
				4107
				4108	static char rindex__doc__[] =
				4109	"S.rindex(sub [,start [,end]]) -> int\n\
				4110	\n\
				4111	Like S.rfind() but raise ValueError when the substring is not found.";
				4112
				4113	static PyObject *
				4114	unicode_rindex(PyUnicodeObject self, PyObject args)
				4115	{
				4116	int result;
				4117	PyUnicodeObject *substring;
				4118	int start = 0;
				4119	int end = INT_MAX;
				4120
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4121	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4122	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4123	return NULL;
				4124	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4125	(PyObject *)substring);
				4126	if (substring == NULL)
				4127	return NULL;
				4128
				4129	result = findstring(self, substring, start, end, -1);
				4130
				4131	Py_DECREF(substring);
				4132	if (result < 0) {
				4133	PyErr_SetString(PyExc_ValueError, "substring not found");
				4134	return NULL;
				4135	}
				4136	return PyInt_FromLong(result);
				4137	}
				4138
				4139	static char rjust__doc__[] =
				4140	"S.rjust(width) -> unicode\n\
				4141	\n\
				4142	Return S right justified in a Unicode string of length width. Padding is\n\
				4143	done using spaces.";
				4144
				4145	static PyObject *
				4146	unicode_rjust(PyUnicodeObject self, PyObject args)
				4147	{
				4148	int width;
				4149	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4150	return NULL;
				4151
				4152	if (self->length >= width) {
				4153	Py_INCREF(self);
				4154	return (PyObject*) self;
				4155	}
				4156
				4157	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4158	}
				4159
				4160	static char rstrip__doc__[] =
				4161	"S.rstrip() -> unicode\n\
				4162	\n\
				4163	Return a copy of the string S with trailing whitespace removed.";
				4164
				4165	static PyObject *
				4166	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4167	{
				4168	if (!PyArg_NoArgs(args))
				4169	return NULL;
				4170	return strip(self, 0, 1);
				4171	}
				4172
				4173	static PyObject*
				4174	unicode_slice(PyUnicodeObject *self, int start, int end)
				4175	{
				4176	/* standard clamping */
				4177	if (start < 0)
				4178	start = 0;
				4179	if (end < 0)
				4180	end = 0;
				4181	if (end > self->length)
				4182	end = self->length;
				4183	if (start == 0 && end == self->length) {
				4184	/* full slice, return original string */
				4185	Py_INCREF(self);
				4186	return (PyObject*) self;
				4187	}
				4188	if (start > end)
				4189	start = end;
				4190	/* copy slice */
				4191	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4192	end - start);
				4193	}
				4194
				4195	PyObject PyUnicode_Split(PyObject s,
				4196	PyObject *sep,
				4197	int maxsplit)
				4198	{
				4199	PyObject *result;
				4200
				4201	s = PyUnicode_FromObject(s);
				4202	if (s == NULL)
				4203	return NULL;
				4204	if (sep != NULL) {
				4205	sep = PyUnicode_FromObject(sep);
				4206	if (sep == NULL) {
				4207	Py_DECREF(s);
				4208	return NULL;
				4209	}
				4210	}
				4211
				4212	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4213
				4214	Py_DECREF(s);
				4215	Py_XDECREF(sep);
				4216	return result;
				4217	}
				4218
				4219	static char split__doc__[] =
				4220	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4221	\n\
				4222	Return a list of the words in S, using sep as the\n\
				4223	delimiter string. If maxsplit is given, at most maxsplit\n\
				4224	splits are done. If sep is not specified, any whitespace string\n\
				4225	is a separator.";
				4226
				4227	static PyObject*
				4228	unicode_split(PyUnicodeObject self, PyObject args)
				4229	{
				4230	PyObject *substring = Py_None;
				4231	int maxcount = -1;
				4232
				4233	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4234	return NULL;
				4235
				4236	if (substring == Py_None)
				4237	return split(self, NULL, maxcount);
				4238	else if (PyUnicode_Check(substring))
				4239	return split(self, (PyUnicodeObject *)substring, maxcount);
				4240	else
				4241	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4242	}
				4243
				4244	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4245	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4246	\n\
				4247	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4248	Line breaks are not included in the resulting list unless keepends\n\
				4249	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4250
				4251	static PyObject*
				4252	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4253	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4254	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4255
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4256	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4257	return NULL;
				4258
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4259	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4260	}
				4261
				4262	static
				4263	PyObject unicode_str(PyUnicodeObject self)
				4264	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4265	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4266	}
				4267
				4268	static char strip__doc__[] =
				4269	"S.strip() -> unicode\n\
				4270	\n\
				4271	Return a copy of S with leading and trailing whitespace removed.";
				4272
				4273	static PyObject *
				4274	unicode_strip(PyUnicodeObject self, PyObject args)
				4275	{
				4276	if (!PyArg_NoArgs(args))
				4277	return NULL;
				4278	return strip(self, 1, 1);
				4279	}
				4280
				4281	static char swapcase__doc__[] =
				4282	"S.swapcase() -> unicode\n\
				4283	\n\
				4284	Return a copy of S with uppercase characters converted to lowercase\n\
				4285	and vice versa.";
				4286
				4287	static PyObject*
				4288	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4289	{
				4290	if (!PyArg_NoArgs(args))
				4291	return NULL;
				4292	return fixup(self, fixswapcase);
				4293	}
				4294
				4295	static char translate__doc__[] =
				4296	"S.translate(table) -> unicode\n\
				4297	\n\
				4298	Return a copy of the string S, where all characters have been mapped\n\
				4299	through the given translation table, which must be a mapping of\n\
				4300	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4301	are left untouched. Characters mapped to None are deleted.";
				4302
				4303	static PyObject*
				4304	unicode_translate(PyUnicodeObject self, PyObject args)
				4305	{
				4306	PyObject *table;
				4307
				4308	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4309	return NULL;
				4310	return PyUnicode_TranslateCharmap(self->str,
				4311	self->length,
				4312	table,
				4313	"ignore");
				4314	}
				4315
				4316	static char upper__doc__[] =
				4317	"S.upper() -> unicode\n\
				4318	\n\
				4319	Return a copy of S converted to uppercase.";
				4320
				4321	static PyObject*
				4322	unicode_upper(PyUnicodeObject self, PyObject args)
				4323	{
				4324	if (!PyArg_NoArgs(args))
				4325	return NULL;
				4326	return fixup(self, fixupper);
				4327	}
				4328
				4329	#if 0
				4330	static char zfill__doc__[] =
				4331	"S.zfill(width) -> unicode\n\
				4332	\n\
				4333	Pad a numeric string x with zeros on the left, to fill a field\n\
				4334	of the specified width. The string x is never truncated.";
				4335
				4336	static PyObject *
				4337	unicode_zfill(PyUnicodeObject self, PyObject args)
				4338	{
				4339	int fill;
				4340	PyUnicodeObject *u;
				4341
				4342	int width;
				4343	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4344	return NULL;
				4345
				4346	if (self->length >= width) {
				4347	Py_INCREF(self);
				4348	return (PyObject*) self;
				4349	}
				4350
				4351	fill = width - self->length;
				4352
				4353	u = pad(self, fill, 0, '0');
				4354
				4355	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4356	/* move sign to beginning of string */
				4357	u->str[0] = u->str[fill];
				4358	u->str[fill] = '0';
				4359	}
				4360
				4361	return (PyObject*) u;
				4362	}
				4363	#endif
				4364
				4365	#if 0
				4366	static PyObject*
				4367	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4368	{
				4369	if (!PyArg_NoArgs(args))
				4370	return NULL;
				4371	return PyInt_FromLong(unicode_freelist_size);
				4372	}
				4373	#endif
				4374
				4375	static char startswith__doc__[] =
				4376	"S.startswith(prefix[, start[, end]]) -> int\n\
				4377	\n\
				4378	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4379	optional start, test S beginning at that position. With optional end, stop\n\
				4380	comparing S at that position.";
				4381
				4382	static PyObject *
				4383	unicode_startswith(PyUnicodeObject *self,
				4384	PyObject *args)
				4385	{
				4386	PyUnicodeObject *substring;
				4387	int start = 0;
				4388	int end = INT_MAX;
				4389	PyObject *result;
				4390
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4391	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4392	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4393	return NULL;
				4394	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4395	(PyObject *)substring);
				4396	if (substring == NULL)
				4397	return NULL;
				4398
				4399	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4400
				4401	Py_DECREF(substring);
				4402	return result;
				4403	}
				4404
				4405
				4406	static char endswith__doc__[] =
				4407	"S.endswith(suffix[, start[, end]]) -> int\n\
				4408	\n\
				4409	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4410	optional start, test S beginning at that position. With optional end, stop\n\
				4411	comparing S at that position.";
				4412
				4413	static PyObject *
				4414	unicode_endswith(PyUnicodeObject *self,
				4415	PyObject *args)
				4416	{
				4417	PyUnicodeObject *substring;
				4418	int start = 0;
				4419	int end = INT_MAX;
				4420	PyObject *result;
				4421
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4422	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4423	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4424	return NULL;
				4425	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4426	(PyObject *)substring);
				4427	if (substring == NULL)
				4428	return NULL;
				4429
				4430	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4431
				4432	Py_DECREF(substring);
				4433	return result;
				4434	}
				4435
				4436
				4437	static PyMethodDef unicode_methods[] = {
				4438
				4439	/* Order is according to common usage: often used methods should
				4440	appear first, since lookup is done sequentially. */
				4441
				4442	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4443	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4444	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4445	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4446	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4447	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4448	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4449	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4450	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4451	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4452	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4453	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4454	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4455	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4456	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4457	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4458	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4459	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4460	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4461	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4462	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4463	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4464	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4465	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4466	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4467	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4468	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4469	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4470	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4471	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4472	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4473	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4474	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4475	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4476	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4477	#if 0
				4478	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4479	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4480	#endif
				4481
				4482	#if 0
				4483	/* This one is just used for debugging the implementation. */
				4484	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4485	#endif
				4486
				4487	{NULL, NULL}
				4488	};
				4489
				4490	static PyObject *
				4491	unicode_getattr(PyUnicodeObject self, char name)
				4492	{
				4493	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4494	}
				4495
				4496	static PySequenceMethods unicode_as_sequence = {
				4497	(inquiry) unicode_length, /* sq_length */
				4498	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4499	(intargfunc) unicode_repeat, /* sq_repeat */
				4500	(intargfunc) unicode_getitem, /* sq_item */
				4501	(intintargfunc) unicode_slice, /* sq_slice */
				4502	0, /* sq_ass_item */
				4503	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4504	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4505	};
				4506
				4507	static int
				4508	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4509	int index,
				4510	const void **ptr)
				4511	{
				4512	if (index != 0) {
				4513	PyErr_SetString(PyExc_SystemError,
				4514	"accessing non-existent unicode segment");
				4515	return -1;
				4516	}
				4517	ptr = (void ) self->str;
				4518	return PyUnicode_GET_DATA_SIZE(self);
				4519	}
				4520
				4521	static int
				4522	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4523	const void **ptr)
				4524	{
				4525	PyErr_SetString(PyExc_TypeError,
				4526	"cannot use unicode as modifyable buffer");
				4527	return -1;
				4528	}
				4529
				4530	static int
				4531	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4532	int *lenp)
				4533	{
				4534	if (lenp)
				4535	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4536	return 1;
				4537	}
				4538
				4539	static int
				4540	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4541	int index,
				4542	const void **ptr)
				4543	{
				4544	PyObject *str;
				4545
				4546	if (index != 0) {
				4547	PyErr_SetString(PyExc_SystemError,
				4548	"accessing non-existent unicode segment");
				4549	return -1;
				4550	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4551	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4552	if (str == NULL)
				4553	return -1;
				4554	ptr = (void ) PyString_AS_STRING(str);
				4555	return PyString_GET_SIZE(str);
				4556	}
				4557
				4558	/* Helpers for PyUnicode_Format() */
				4559
				4560	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4561	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4562	{
				4563	int argidx = *p_argidx;
				4564	if (argidx < arglen) {
				4565	(*p_argidx)++;
				4566	if (arglen < 0)
				4567	return args;
				4568	else
				4569	return PyTuple_GetItem(args, argidx);
				4570	}
				4571	PyErr_SetString(PyExc_TypeError,
				4572	"not enough arguments for format string");
				4573	return NULL;
				4574	}
				4575
				4576	#define F_LJUST (1<<0)
				4577	#define F_SIGN (1<<1)
				4578	#define F_BLANK (1<<2)
				4579	#define F_ALT (1<<3)
				4580	#define F_ZERO (1<<4)
				4581
				4582	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4583	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4584	{
				4585	register int i;
				4586	int len;
				4587	va_list va;
				4588	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4589	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4590
				4591	/* First, format the string as char array, then expand to Py_UNICODE
				4592	array. */
				4593	charbuffer = (char *)buffer;
				4594	len = vsprintf(charbuffer, format, va);
				4595	for (i = len - 1; i >= 0; i--)
				4596	buffer[i] = (Py_UNICODE) charbuffer[i];
				4597
				4598	va_end(va);
				4599	return len;
				4600	}
				4601
				4602	static int
				4603	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4604	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4605	int flags,
				4606	int prec,
				4607	int type,
				4608	PyObject *v)
				4609	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4610	/* fmt = '%#.' + `prec` + `type`
				4611	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4612	char fmt[20];
				4613	double x;
				4614
				4615	x = PyFloat_AsDouble(v);
				4616	if (x == -1.0 && PyErr_Occurred())
				4617	return -1;
				4618	if (prec < 0)
				4619	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4620	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4621	type = 'g';
				4622	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4623	/* worst case length calc to ensure no buffer overrun:
				4624	fmt = %#.<prec>g
				4625	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4626	for any double rep.)
				4627	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4628	If prec=0 the effective precision is 1 (the leading digit is
				4629	always given), therefore increase by one to 10+prec. */
				4630	if (buflen <= (size_t)10 + (size_t)prec) {
				4631	PyErr_SetString(PyExc_OverflowError,
				4632	"formatted float is too long (precision too long?)");
				4633	return -1;
				4634	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4635	return usprintf(buf, fmt, x);
				4636	}
				4637
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4638	static PyObject*
				4639	formatlong(PyObject *val, int flags, int prec, int type)
				4640	{
				4641	char *buf;
				4642	int i, len;
				4643	PyObject str; / temporary string object. */
				4644	PyUnicodeObject *result;
				4645
				4646	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4647	if (!str)
				4648	return NULL;
				4649	result = _PyUnicode_New(len);
				4650	for (i = 0; i < len; i++)
				4651	result->str[i] = buf[i];
				4652	result->str[len] = 0;
				4653	Py_DECREF(str);
				4654	return (PyObject*)result;
				4655	}
				4656
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4657	static int
				4658	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4659	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4660	int flags,
				4661	int prec,
				4662	int type,
				4663	PyObject *v)
				4664	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4665	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4666	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4667	+ 1 + 1 = 24*/
				4668	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4669	long x;
				4670
				4671	x = PyInt_AsLong(v);
				4672	if (x == -1 && PyErr_Occurred())
				4673	return -1;
				4674	if (prec < 0)
				4675	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4676	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4677	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4678	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4679	PyErr_SetString(PyExc_OverflowError,
				4680	"formatted integer is too long (precision too long?)");
				4681	return -1;
				4682	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4683	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				4684	* but we want it (for consistency with other %#x conversions, and
				4685	* for consistency with Python's hex() function).
				4686	*/
				4687	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X'))
				4688	sprintf(fmt, "0%c%%%s.%dl%c", type, "#", prec, type);
				4689	else
				4690	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4691	return usprintf(buf, fmt, x);
				4692	}
				4693
				4694	static int
				4695	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4696	size_t buflen,
				4697	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4698	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4699	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4700	if (PyUnicode_Check(v)) {
				4701	if (PyUnicode_GET_SIZE(v) != 1)
				4702	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4703	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4704	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4705
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4706	else if (PyString_Check(v)) {
				4707	if (PyString_GET_SIZE(v) != 1)
				4708	goto onError;
				4709	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4710	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4711
				4712	else {
				4713	/* Integer input truncated to a character */
				4714	long x;
				4715	x = PyInt_AsLong(v);
				4716	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4717	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4718	buf[0] = (char) x;
				4719	}
				4720	buf[1] = '\0';
				4721	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4722
				4723	onError:
				4724	PyErr_SetString(PyExc_TypeError,
				4725	"%c requires int or char");
				4726	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4727	}
				4728
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer that
   formatfloat(), formatint() and formatchar() write into.

   XXX This is a magic number.  Every formatting routine checks the bound
   itself so that the buffer cannot overflow; allocating a buffer of the
   right size for each conversion might be a better solution.  For now,
   the current solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
				4738
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4739	PyObject PyUnicode_Format(PyObject format,
				4740	PyObject *args)
				4741	{
				4742	Py_UNICODE fmt, res;
				4743	int fmtcnt, rescnt, reslen, arglen, argidx;
				4744	int args_owned = 0;
				4745	PyUnicodeObject *result = NULL;
				4746	PyObject *dict = NULL;
				4747	PyObject *uformat;
				4748
				4749	if (format == NULL \|\| args == NULL) {
				4750	PyErr_BadInternalCall();
				4751	return NULL;
				4752	}
				4753	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4754	if (uformat == NULL)
				4755	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4756	fmt = PyUnicode_AS_UNICODE(uformat);
				4757	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4758
				4759	reslen = rescnt = fmtcnt + 100;
				4760	result = _PyUnicode_New(reslen);
				4761	if (result == NULL)
				4762	goto onError;
				4763	res = PyUnicode_AS_UNICODE(result);
				4764
				4765	if (PyTuple_Check(args)) {
				4766	arglen = PyTuple_Size(args);
				4767	argidx = 0;
				4768	}
				4769	else {
				4770	arglen = -1;
				4771	argidx = -2;
				4772	}
				4773	if (args->ob_type->tp_as_mapping)
				4774	dict = args;
				4775
				4776	while (--fmtcnt >= 0) {
				4777	if (*fmt != '%') {
				4778	if (--rescnt < 0) {
				4779	rescnt = fmtcnt + 100;
				4780	reslen += rescnt;
				4781	if (_PyUnicode_Resize(result, reslen) < 0)
				4782	return NULL;
				4783	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4784	--rescnt;
				4785	}
				4786	res++ = fmt++;
				4787	}
				4788	else {
				4789	/* Got a format specifier */
				4790	int flags = 0;
				4791	int width = -1;
				4792	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4793	Py_UNICODE c = '\0';
				4794	Py_UNICODE fill;
				4795	PyObject *v = NULL;
				4796	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4797	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4798	Py_UNICODE sign;
				4799	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4800	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4801
				4802	fmt++;
				4803	if (*fmt == '(') {
				4804	Py_UNICODE *keystart;
				4805	int keylen;
				4806	PyObject *key;
				4807	int pcount = 1;
				4808
				4809	if (dict == NULL) {
				4810	PyErr_SetString(PyExc_TypeError,
				4811	"format requires a mapping");
				4812	goto onError;
				4813	}
				4814	++fmt;
				4815	--fmtcnt;
				4816	keystart = fmt;
				4817	/* Skip over balanced parentheses */
				4818	while (pcount > 0 && --fmtcnt >= 0) {
				4819	if (*fmt == ')')
				4820	--pcount;
				4821	else if (*fmt == '(')
				4822	++pcount;
				4823	fmt++;
				4824	}
				4825	keylen = fmt - keystart - 1;
				4826	if (fmtcnt < 0 \|\| pcount > 0) {
				4827	PyErr_SetString(PyExc_ValueError,
				4828	"incomplete format key");
				4829	goto onError;
				4830	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4831	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4832	then looked up since Python uses strings to hold
				4833	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4834	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4835	key = PyUnicode_EncodeUTF8(keystart,
				4836	keylen,
				4837	NULL);
				4838	if (key == NULL)
				4839	goto onError;
				4840	if (args_owned) {
				4841	Py_DECREF(args);
				4842	args_owned = 0;
				4843	}
				4844	args = PyObject_GetItem(dict, key);
				4845	Py_DECREF(key);
				4846	if (args == NULL) {
				4847	goto onError;
				4848	}
				4849	args_owned = 1;
				4850	arglen = -1;
				4851	argidx = -2;
				4852	}
				4853	while (--fmtcnt >= 0) {
				4854	switch (c = *fmt++) {
				4855	case '-': flags \|= F_LJUST; continue;
				4856	case '+': flags \|= F_SIGN; continue;
				4857	case ' ': flags \|= F_BLANK; continue;
				4858	case '#': flags \|= F_ALT; continue;
				4859	case '0': flags \|= F_ZERO; continue;
				4860	}
				4861	break;
				4862	}
				4863	if (c == '*') {
				4864	v = getnextarg(args, arglen, &argidx);
				4865	if (v == NULL)
				4866	goto onError;
				4867	if (!PyInt_Check(v)) {
				4868	PyErr_SetString(PyExc_TypeError,
				4869	"* wants int");
				4870	goto onError;
				4871	}
				4872	width = PyInt_AsLong(v);
				4873	if (width < 0) {
				4874	flags \|= F_LJUST;
				4875	width = -width;
				4876	}
				4877	if (--fmtcnt >= 0)
				4878	c = *fmt++;
				4879	}
				4880	else if (c >= '0' && c <= '9') {
				4881	width = c - '0';
				4882	while (--fmtcnt >= 0) {
				4883	c = *fmt++;
				4884	if (c < '0' \|\| c > '9')
				4885	break;
				4886	if ((width*10) / 10 != width) {
				4887	PyErr_SetString(PyExc_ValueError,
				4888	"width too big");
				4889	goto onError;
				4890	}
				4891	width = width*10 + (c - '0');
				4892	}
				4893	}
				4894	if (c == '.') {
				4895	prec = 0;
				4896	if (--fmtcnt >= 0)
				4897	c = *fmt++;
				4898	if (c == '*') {
				4899	v = getnextarg(args, arglen, &argidx);
				4900	if (v == NULL)
				4901	goto onError;
				4902	if (!PyInt_Check(v)) {
				4903	PyErr_SetString(PyExc_TypeError,
				4904	"* wants int");
				4905	goto onError;
				4906	}
				4907	prec = PyInt_AsLong(v);
				4908	if (prec < 0)
				4909	prec = 0;
				4910	if (--fmtcnt >= 0)
				4911	c = *fmt++;
				4912	}
				4913	else if (c >= '0' && c <= '9') {
				4914	prec = c - '0';
				4915	while (--fmtcnt >= 0) {
				4916	c = Py_CHARMASK(*fmt++);
				4917	if (c < '0' \|\| c > '9')
				4918	break;
				4919	if ((prec*10) / 10 != prec) {
				4920	PyErr_SetString(PyExc_ValueError,
				4921	"prec too big");
				4922	goto onError;
				4923	}
				4924	prec = prec*10 + (c - '0');
				4925	}
				4926	}
				4927	} /* prec */
				4928	if (fmtcnt >= 0) {
				4929	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4930	if (--fmtcnt >= 0)
				4931	c = *fmt++;
				4932	}
				4933	}
				4934	if (fmtcnt < 0) {
				4935	PyErr_SetString(PyExc_ValueError,
				4936	"incomplete format");
				4937	goto onError;
				4938	}
				4939	if (c != '%') {
				4940	v = getnextarg(args, arglen, &argidx);
				4941	if (v == NULL)
				4942	goto onError;
				4943	}
				4944	sign = 0;
				4945	fill = ' ';
				4946	switch (c) {
				4947
				4948	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4949	pbuf = formatbuf;
				4950	/* presume that buffer length is at least 1 */
				4951	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4952	len = 1;
				4953	break;
				4954
				4955	case 's':
				4956	case 'r':
				4957	if (PyUnicode_Check(v) && c == 's') {
				4958	temp = v;
				4959	Py_INCREF(temp);
				4960	}
				4961	else {
				4962	PyObject *unicode;
				4963	if (c == 's')
				4964	temp = PyObject_Str(v);
				4965	else
				4966	temp = PyObject_Repr(v);
				4967	if (temp == NULL)
				4968	goto onError;
				4969	if (!PyString_Check(temp)) {
				4970	/* XXX Note: this should never happen, since
				4971	PyObject_Repr() and PyObject_Str() assure
				4972	this */
				4973	Py_DECREF(temp);
				4974	PyErr_SetString(PyExc_TypeError,
				4975	"%s argument has non-string str()");
				4976	goto onError;
				4977	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4978	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4979	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4980	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4981	"strict");
				4982	Py_DECREF(temp);
				4983	temp = unicode;
				4984	if (temp == NULL)
				4985	goto onError;
				4986	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4987	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4988	len = PyUnicode_GET_SIZE(temp);
				4989	if (prec >= 0 && len > prec)
				4990	len = prec;
				4991	break;
				4992
				4993	case 'i':
				4994	case 'd':
				4995	case 'u':
				4996	case 'o':
				4997	case 'x':
				4998	case 'X':
				4999	if (c == 'i')
				5000	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5001	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5002	temp = formatlong(v, flags, prec, c);
				5003	if (!temp)
				5004	goto onError;
				5005	pbuf = PyUnicode_AS_UNICODE(temp);
				5006	len = PyUnicode_GET_SIZE(temp);
				5007	/* unbounded ints can always produce
				5008	a sign character! */
				5009	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5010	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5011	else {
				5012	pbuf = formatbuf;
				5013	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5014	flags, prec, c, v);
				5015	if (len < 0)
				5016	goto onError;
				5017	/* only d conversion is signed */
				5018	sign = c == 'd';
				5019	}
				5020	if (flags & F_ZERO)
				5021	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5022	break;
				5023
				5024	case 'e':
				5025	case 'E':
				5026	case 'f':
				5027	case 'g':
				5028	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5029	pbuf = formatbuf;
				5030	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5031	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5032	if (len < 0)
				5033	goto onError;
				5034	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5035	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5036	fill = '0';
				5037	break;
				5038
				5039	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5040	pbuf = formatbuf;
				5041	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5042	if (len < 0)
				5043	goto onError;
				5044	break;
				5045
				5046	default:
				5047	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5048	"unsupported format character '%c' (0x%x) "
				5049	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5050	(31<=c && c<=126) ? c : '?',
				5051	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5052	goto onError;
				5053	}
				5054	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5055	if (pbuf == '-' \|\| pbuf == '+') {
				5056	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5057	len--;
				5058	}
				5059	else if (flags & F_SIGN)
				5060	sign = '+';
				5061	else if (flags & F_BLANK)
				5062	sign = ' ';
				5063	else
				5064	sign = 0;
				5065	}
				5066	if (width < len)
				5067	width = len;
				5068	if (rescnt < width + (sign != 0)) {
				5069	reslen -= rescnt;
				5070	rescnt = width + fmtcnt + 100;
				5071	reslen += rescnt;
				5072	if (_PyUnicode_Resize(result, reslen) < 0)
				5073	return NULL;
				5074	res = PyUnicode_AS_UNICODE(result)
				5075	+ reslen - rescnt;
				5076	}
				5077	if (sign) {
				5078	if (fill != ' ')
				5079	*res++ = sign;
				5080	rescnt--;
				5081	if (width > len)
				5082	width--;
				5083	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5084	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5085	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5086	assert(pbuf[1] == c);
				5087	if (fill != ' ') {
				5088	res++ = pbuf++;
				5089	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5090	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5091	rescnt -= 2;
				5092	width -= 2;
				5093	if (width < 0)
				5094	width = 0;
				5095	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5096	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5097	if (width > len && !(flags & F_LJUST)) {
				5098	do {
				5099	--rescnt;
				5100	*res++ = fill;
				5101	} while (--width > len);
				5102	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5103	if (fill == ' ') {
				5104	if (sign)
				5105	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5106	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5107	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5108	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5109	res++ = pbuf++;
				5110	res++ = pbuf++;
				5111	}
				5112	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5113	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5114	res += len;
				5115	rescnt -= len;
				5116	while (--width >= len) {
				5117	--rescnt;
				5118	*res++ = ' ';
				5119	}
				5120	if (dict && (argidx < arglen) && c != '%') {
				5121	PyErr_SetString(PyExc_TypeError,
				5122	"not all arguments converted");
				5123	goto onError;
				5124	}
				5125	Py_XDECREF(temp);
				5126	} /* '%' */
				5127	} /* until end */
				5128	if (argidx < arglen && !dict) {
				5129	PyErr_SetString(PyExc_TypeError,
				5130	"not all arguments converted");
				5131	goto onError;
				5132	}
				5133
				5134	if (args_owned) {
				5135	Py_DECREF(args);
				5136	}
				5137	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5138	if (_PyUnicode_Resize(result, reslen - rescnt))
				5139	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5140	return (PyObject *)result;
				5141
				5142	onError:
				5143	Py_XDECREF(result);
				5144	Py_DECREF(uformat);
				5145	if (args_owned) {
				5146	Py_DECREF(args);
				5147	}
				5148	return NULL;
				5149	}
				5150
				5151	static PyBufferProcs unicode_as_buffer = {
				5152	(getreadbufferproc) unicode_buffer_getreadbuf,
				5153	(getwritebufferproc) unicode_buffer_getwritebuf,
				5154	(getsegcountproc) unicode_buffer_getsegcount,
				5155	(getcharbufferproc) unicode_buffer_getcharbuf,
				5156	};
				5157
				5158	PyTypeObject PyUnicode_Type = {
				5159	PyObject_HEAD_INIT(&PyType_Type)
				5160	0, /* ob_size */
				5161	"unicode", /* tp_name */
				5162	sizeof(PyUnicodeObject), /* tp_size */
				5163	0, /* tp_itemsize */
				5164	/* Slots */
				5165	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5166	0, /* tp_print */
				5167	(getattrfunc)unicode_getattr, /* tp_getattr */
				5168	0, /* tp_setattr */
				5169	(cmpfunc) unicode_compare, /* tp_compare */
				5170	(reprfunc) unicode_repr, /* tp_repr */
				5171	0, /* tp_as_number */
				5172	&unicode_as_sequence, /* tp_as_sequence */
				5173	0, /* tp_as_mapping */
				5174	(hashfunc) unicode_hash, /* tp_hash*/
				5175	0, /* tp_call*/
				5176	(reprfunc) unicode_str, /* tp_str */
				5177	(getattrofunc) NULL, /* tp_getattro */
				5178	(setattrofunc) NULL, /* tp_setattro */
				5179	&unicode_as_buffer, /* tp_as_buffer */
				5180	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5181	};
				5182
				5183	/* Initialize the Unicode implementation */
				5184
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5185	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5186	{
				5187	/* Doublecheck the configuration... */
				5188	if (sizeof(Py_UNICODE) != 2)
				5189	Py_FatalError("Unicode configuration error: "
				5190	"sizeof(Py_UNICODE) != 2 bytes");
				5191
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5192	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5193	unicode_freelist = NULL;
				5194	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5195	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5196	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5197	}
				5198
				5199	/* Finalize the Unicode implementation */
				5200
				5201	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5202	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5203	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5204	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5205
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5206	Py_XDECREF(unicode_empty);
				5207	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5208
				5209	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5210	PyUnicodeObject *v = u;
				5211	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5212	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5213	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5214	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5215	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5216	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5217	unicode_freelist = NULL;
				5218	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5219	}