Blame - Objects/unicodeobject.c - platform/external/python/cpython2

blob: a06c40b9d604f3d99c1d553d40c8b7ca95c8fd83 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	48	/* Limit for the Unicode object free list */
				49
				50	#define MAX_UNICODE_FREELIST_SIZE 1024
				51
				52	/* Limit for the Unicode object free list stay alive optimization.
				53
				54	The implementation will keep allocated Unicode memory intact for
				55	all objects on the free list having a size less than this
				56	limit. This reduces malloc() overhead for small Unicode objects.
				57
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	58	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	59	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	60	malloc()-overhead) bytes of unused garbage.
				61
				62	Setting the limit to 0 effectively turns the feature off.
				63
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	64	Note: This is an experimental feature ! If you get core dumps when
				65	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	66
				67	*/
				68
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	69	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	/* Endianness switches; defaults to little endian */
				72
				73	#ifdef WORDS_BIGENDIAN
				74	# define BYTEORDER_IS_BIG_ENDIAN
				75	#else
				76	# define BYTEORDER_IS_LITTLE_ENDIAN
				77	#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
				86	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	88
				89	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	90	static PyUnicodeObject *unicode_freelist;
				91	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	93	/* Default encoding to use and assume when NULL is passed as encoding
				94	parameter; it is initialized by _PyUnicode_Init().
				95
				96	Always use the PyUnicode_SetDefaultEncoding() and
				97	PyUnicode_GetDefaultEncoding() APIs to access this global.
				98
				99	*/
				100
				101	static char unicode_default_encoding[100];
				102
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	103	/* --- Unicode Object ----------------------------------------------------- */
				104
				105	static
				106	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				107	int length)
				108	{
				109	void *oldstr;
				110
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	111	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	112	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	113	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Resizing unicode_empty is not allowed. */
				116	if (unicode == unicode_empty) {
				117	PyErr_SetString(PyExc_SystemError,
				118	"can't resize empty unicode object");
				119	return -1;
				120	}
				121
				122	/* We allocate one more byte to make sure the string is
				123	Ux0000 terminated -- XXX is this needed ? */
				124	oldstr = unicode->str;
				125	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				126	if (!unicode->str) {
				127	unicode->str = oldstr;
				128	PyErr_NoMemory();
				129	return -1;
				130	}
				131	unicode->str[length] = 0;
				132	unicode->length = length;
				133
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	134	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	136	if (unicode->defenc) {
				137	Py_DECREF(unicode->defenc);
				138	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	}
				140	unicode->hash = -1;
				141
				142	return 0;
				143	}
				144
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	int PyUnicode_Resize(PyObject **unicode,
				146	int length)
				147	{
				148	PyUnicodeObject *v;
				149
				150	if (unicode == NULL) {
				151	PyErr_BadInternalCall();
				152	return -1;
				153	}
				154	v = (PyUnicodeObject )unicode;
				155	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				156	PyErr_BadInternalCall();
				157	return -1;
				158	}
				159	return _PyUnicode_Resize(v, length);
				160	}
				161
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	162	/* We allocate one more byte to make sure the string is
				163	Ux0000 terminated -- XXX is this needed ?
				164
				165	XXX This allocator could further be enhanced by assuring that the
				166	free list never reduces its size below 1.
				167
				168	*/
				169
				170	static
				171	PyUnicodeObject *_PyUnicode_New(int length)
				172	{
				173	register PyUnicodeObject *unicode;
				174
				175	/* Optimization for empty strings */
				176	if (length == 0 && unicode_empty != NULL) {
				177	Py_INCREF(unicode_empty);
				178	return unicode_empty;
				179	}
				180
				181	/* Unicode freelist & memory allocation */
				182	if (unicode_freelist) {
				183	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	184	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	185	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	186	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	187	/* Keep-Alive optimization: we only upsize the buffer,
				188	never downsize it. */
				189	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	190	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	191	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	192	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	193	}
				194	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	195	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	197	}
				198	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	}
				200	else {
				201	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				202	if (unicode == NULL)
				203	return NULL;
				204	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				205	}
				206
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	207	if (!unicode->str) {
				208	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	209	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode->str[length] = 0;
				212	unicode->length = length;
				213	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	214	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	215	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	216
				217	onError:
				218	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	219	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	220	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	221	}
				222
				223	static
				224	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				225	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	227	/* Keep-Alive optimization */
				228	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	229	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	230	unicode->str = NULL;
				231	unicode->length = 0;
				232	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	233	if (unicode->defenc) {
				234	Py_DECREF(unicode->defenc);
				235	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	236	}
				237	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	238	(PyUnicodeObject *)unicode = unicode_freelist;
				239	unicode_freelist = unicode;
				240	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	}
				242	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	243	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	244	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	246	}
				247	}
				248
				249	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				250	int size)
				251	{
				252	PyUnicodeObject *unicode;
				253
				254	unicode = _PyUnicode_New(size);
				255	if (!unicode)
				256	return NULL;
				257
				258	/* Copy the Unicode data into the new object */
				259	if (u != NULL)
				260	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				261
				262	return (PyObject *)unicode;
				263	}
				264
				265	#ifdef HAVE_WCHAR_H
				266
				267	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				268	int size)
				269	{
				270	PyUnicodeObject *unicode;
				271
				272	if (w == NULL) {
				273	PyErr_BadInternalCall();
				274	return NULL;
				275	}
				276
				277	unicode = _PyUnicode_New(size);
				278	if (!unicode)
				279	return NULL;
				280
				281	/* Copy the wchar_t data into the new object */
				282	#ifdef HAVE_USABLE_WCHAR_T
				283	memcpy(unicode->str, w, size * sizeof(wchar_t));
				284	#else
				285	{
				286	register Py_UNICODE *u;
				287	register int i;
				288	u = PyUnicode_AS_UNICODE(unicode);
				289	for (i = size; i >= 0; i--)
				290	u++ = w++;
				291	}
				292	#endif
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				298	register wchar_t *w,
				299	int size)
				300	{
				301	if (unicode == NULL) {
				302	PyErr_BadInternalCall();
				303	return -1;
				304	}
				305	if (size > PyUnicode_GET_SIZE(unicode))
				306	size = PyUnicode_GET_SIZE(unicode);
				307	#ifdef HAVE_USABLE_WCHAR_T
				308	memcpy(w, unicode->str, size * sizeof(wchar_t));
				309	#else
				310	{
				311	register Py_UNICODE *u;
				312	register int i;
				313	u = PyUnicode_AS_UNICODE(unicode);
				314	for (i = size; i >= 0; i--)
				315	w++ = u++;
				316	}
				317	#endif
				318
				319	return size;
				320	}
				321
				322	#endif
				323
				324	PyObject PyUnicode_FromObject(register PyObject obj)
				325	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	326	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				327	}
				328
				329	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				330	const char *encoding,
				331	const char *errors)
				332	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	333	const char *s;
				334	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	335	int owned = 0;
				336	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	337
				338	if (obj == NULL) {
				339	PyErr_BadInternalCall();
				340	return NULL;
				341	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	342
				343	/* Coerce object */
				344	if (PyInstance_Check(obj)) {
				345	PyObject *func;
				346	func = PyObject_GetAttrString(obj, "__str__");
				347	if (func == NULL) {
				348	PyErr_SetString(PyExc_TypeError,
				349	"coercing to Unicode: instance doesn't define __str__");
				350	return NULL;
				351	}
				352	obj = PyEval_CallObject(func, NULL);
				353	Py_DECREF(func);
				354	if (obj == NULL)
				355	return NULL;
				356	owned = 1;
				357	}
				358	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	360	v = obj;
				361	if (encoding) {
				362	PyErr_SetString(PyExc_TypeError,
				363	"decoding Unicode is not supported");
				364	return NULL;
				365	}
				366	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	367	}
				368	else if (PyString_Check(obj)) {
				369	s = PyString_AS_STRING(obj);
				370	len = PyString_GET_SIZE(obj);
				371	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	372	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				373	/* Overwrite the error message with something more useful in
				374	case of a TypeError. */
				375	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	376	PyErr_Format(PyExc_TypeError,
				377	"coercing to Unicode: need string or buffer, "
				378	"%.80s found",
				379	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	380	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	381	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	382
				383	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	384	if (len == 0) {
				385	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	387	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	388	else
				389	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	390
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	391	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	392	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	393	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	394	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	395	return v;
				396
				397	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	398	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	399	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	400	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	401	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	402	}
				403
				404	PyObject PyUnicode_Decode(const char s,
				405	int size,
				406	const char *encoding,
				407	const char *errors)
				408	{
				409	PyObject buffer = NULL, unicode;
				410
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	411	if (encoding == NULL)
				412	encoding = PyUnicode_GetDefaultEncoding();
				413
				414	/* Shortcuts for common default encodings */
				415	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	417	else if (strcmp(encoding, "latin-1") == 0)
				418	return PyUnicode_DecodeLatin1(s, size, errors);
				419	else if (strcmp(encoding, "ascii") == 0)
				420	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	421
				422	/* Decode via the codec registry */
				423	buffer = PyBuffer_FromMemory((void *)s, size);
				424	if (buffer == NULL)
				425	goto onError;
				426	unicode = PyCodec_Decode(buffer, encoding, errors);
				427	if (unicode == NULL)
				428	goto onError;
				429	if (!PyUnicode_Check(unicode)) {
				430	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	431	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	432	unicode->ob_type->tp_name);
				433	Py_DECREF(unicode);
				434	goto onError;
				435	}
				436	Py_DECREF(buffer);
				437	return unicode;
				438
				439	onError:
				440	Py_XDECREF(buffer);
				441	return NULL;
				442	}
				443
				444	PyObject PyUnicode_Encode(const Py_UNICODE s,
				445	int size,
				446	const char *encoding,
				447	const char *errors)
				448	{
				449	PyObject v, unicode;
				450
				451	unicode = PyUnicode_FromUnicode(s, size);
				452	if (unicode == NULL)
				453	return NULL;
				454	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				455	Py_DECREF(unicode);
				456	return v;
				457	}
				458
				459	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				460	const char *encoding,
				461	const char *errors)
				462	{
				463	PyObject *v;
				464
				465	if (!PyUnicode_Check(unicode)) {
				466	PyErr_BadArgument();
				467	goto onError;
				468	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	469
				470	if (encoding == NULL)
				471	encoding = PyUnicode_GetDefaultEncoding();
				472
				473	/* Shortcuts for common default encodings */
				474	if (errors == NULL) {
				475	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	476	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	477	else if (strcmp(encoding, "latin-1") == 0)
				478	return PyUnicode_AsLatin1String(unicode);
				479	else if (strcmp(encoding, "ascii") == 0)
				480	return PyUnicode_AsASCIIString(unicode);
				481	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	482
				483	/* Encode via the codec registry */
				484	v = PyCodec_Encode(unicode, encoding, errors);
				485	if (v == NULL)
				486	goto onError;
				487	/* XXX Should we really enforce this ? */
				488	if (!PyString_Check(v)) {
				489	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	490	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	491	v->ob_type->tp_name);
				492	Py_DECREF(v);
				493	goto onError;
				494	}
				495	return v;
				496
				497	onError:
				498	return NULL;
				499	}
				500
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	501	/* Return a Python string holding the default encoded value of the
				502	Unicode object.
				503
				504	The resulting string is cached in the Unicode object for subsequent
				505	usage by this function. The cached version is needed to implement
				506	the character buffer interface and will live (at least) as long as
				507	the Unicode object itself.
				508
				509	The refcount of the string is not incremented.
				510
				511	* Exported for internal use by the interpreter only !!! *
				512
				513	*/
				514
				515	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				516	const char *errors)
				517	{
				518	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				519
				520	if (v)
				521	return v;
				522	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				523	if (v && errors == NULL)
				524	((PyUnicodeObject *)unicode)->defenc = v;
				525	return v;
				526	}
				527
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	528	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				529	{
				530	if (!PyUnicode_Check(unicode)) {
				531	PyErr_BadArgument();
				532	goto onError;
				533	}
				534	return PyUnicode_AS_UNICODE(unicode);
				535
				536	onError:
				537	return NULL;
				538	}
				539
				540	int PyUnicode_GetSize(PyObject *unicode)
				541	{
				542	if (!PyUnicode_Check(unicode)) {
				543	PyErr_BadArgument();
				544	goto onError;
				545	}
				546	return PyUnicode_GET_SIZE(unicode);
				547
				548	onError:
				549	return -1;
				550	}
				551
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	552	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	553	{
				554	return unicode_default_encoding;
				555	}
				556
				557	int PyUnicode_SetDefaultEncoding(const char *encoding)
				558	{
				559	PyObject *v;
				560
				561	/* Make sure the encoding is valid. As side effect, this also
				562	loads the encoding into the codec registry cache. */
				563	v = _PyCodec_Lookup(encoding);
				564	if (v == NULL)
				565	goto onError;
				566	Py_DECREF(v);
				567	strncpy(unicode_default_encoding,
				568	encoding,
				569	sizeof(unicode_default_encoding));
				570	return 0;
				571
				572	onError:
				573	return -1;
				574	}
				575
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	576	/* --- UTF-8 Codec -------------------------------------------------------- */
				577
				578	static
				579	char utf8_code_length[256] = {
				580	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				581	illegal prefix. see RFC 2279 for details */
				582	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				583	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				584	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				585	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				586	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				587	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				588	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				589	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				590	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				591	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				592	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				593	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				594	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				595	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				596	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				597	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				598	};
				599
				600	static
				601	int utf8_decoding_error(const char **source,
				602	Py_UNICODE **dest,
				603	const char *errors,
				604	const char *details)
				605	{
				606	if ((errors == NULL) \|\|
				607	(strcmp(errors,"strict") == 0)) {
				608	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	609	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	610	details);
				611	return -1;
				612	}
				613	else if (strcmp(errors,"ignore") == 0) {
				614	(*source)++;
				615	return 0;
				616	}
				617	else if (strcmp(errors,"replace") == 0) {
				618	(*source)++;
				619	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				620	(*dest)++;
				621	return 0;
				622	}
				623	else {
				624	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	625	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	626	errors);
				627	return -1;
				628	}
				629	}
				630
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	631	PyObject PyUnicode_DecodeUTF8(const char s,
				632	int size,
				633	const char *errors)
				634	{
				635	int n;
				636	const char *e;
				637	PyUnicodeObject *unicode;
				638	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	639	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	640
				641	/* Note: size will always be longer than the resulting Unicode
				642	character count */
				643	unicode = _PyUnicode_New(size);
				644	if (!unicode)
				645	return NULL;
				646	if (size == 0)
				647	return (PyObject *)unicode;
				648
				649	/* Unpack UTF-8 encoded data */
				650	p = unicode->str;
				651	e = s + size;
				652
				653	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	654	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	655
				656	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	657	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	658	s++;
				659	continue;
				660	}
				661
				662	n = utf8_code_length[ch];
				663
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	if (s + n > e) {
				665	errmsg = "unexpected end of data";
				666	goto utf8Error;
				667	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	668
				669	switch (n) {
				670
				671	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	672	errmsg = "unexpected code byte";
				673	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	674	break;
				675
				676	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	677	errmsg = "internal error";
				678	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	679	break;
				680
				681	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	682	if ((s[1] & 0xc0) != 0x80) {
				683	errmsg = "invalid data";
				684	goto utf8Error;
				685	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	687	if (ch < 0x80) {
				688	errmsg = "illegal encoding";
				689	goto utf8Error;
				690	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	691	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	692	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693	break;
				694
				695	case 3:
				696	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	(s[2] & 0xc0) != 0x80) {
				698	errmsg = "invalid data";
				699	goto utf8Error;
				700	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	701	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				703	errmsg = "illegal encoding";
				704	goto utf8Error;
				705	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	706	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	707	*p++ = (Py_UNICODE)ch;
				708	break;
				709
				710	case 4:
				711	if ((s[1] & 0xc0) != 0x80 \|\|
				712	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	(s[3] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	717	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				718	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				719	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	720	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				721	byte encoding */
				722	(ch > 0x10ffff)) { /* maximum value allowed for
				723	UTF-16 */
				724	errmsg = "illegal encoding";
				725	goto utf8Error;
				726	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	727	/* compute and append the two surrogates: */
				728
				729	/* translate from 10000..10FFFF to 0..FFFF */
				730	ch -= 0x10000;
				731
				732	/* high surrogate = top 10 bits added to D800 */
				733	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				734
				735	/* low surrogate = bottom 10 bits added to DC00 */
				736	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	break;
				738
				739	default:
				740	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	741	errmsg = "unsupported Unicode code range";
				742	goto utf8Error;
				743	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	744	}
				745	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	746	continue;
				747
				748	utf8Error:
				749	if (utf8_decoding_error(&s, &p, errors, errmsg))
				750	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	751	}
				752
				753	/* Adjust length */
				754	if (_PyUnicode_Resize(unicode, p - unicode->str))
				755	goto onError;
				756
				757	return (PyObject *)unicode;
				758
				759	onError:
				760	Py_DECREF(unicode);
				761	return NULL;
				762	}
				763
/* Kept for reference only: the UTF-8 encoder deals with UTF-16
   surrogates itself, so this helper is compiled out. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding failure.

   errors == NULL or "strict": set UnicodeError (with `details`), return -1.
   "ignore":                   do nothing, return 0.
   "replace":                  write '?' through *dest, advance it, return 0.
   anything else:              set ValueError, return -1.

   `source` is accepted but never touched. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(errors, "ignore"))
        return 0;
    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	797
				798	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				799	int size,
				800	const char *errors)
				801	{
				802	PyObject *v;
				803	char *p;
				804	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	805	Py_UCS4 ch2;
				806	unsigned int cbAllocated = 3 * size;
				807	unsigned int cbWritten = 0;
				808	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	809
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	810	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	811	if (v == NULL)
				812	return NULL;
				813	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	814	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	815
				816	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	817	while (i < size) {
				818	Py_UCS4 ch = s[i++];
				819	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	820	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	821	cbWritten++;
				822	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	823	else if (ch < 0x0800) {
				824	*p++ = 0xc0 \| (ch >> 6);
				825	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	826	cbWritten += 2;
				827	}
				828	else {
				829	/* Check for high surrogate */
				830	if (0xD800 <= ch && ch <= 0xDBFF) {
				831	if (i != size) {
				832	ch2 = s[i];
				833	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				834
				835	if (cbWritten >= (cbAllocated - 4)) {
				836	/* Provide enough room for some more
				837	surrogates */
				838	cbAllocated += 4*10;
				839	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	840	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	}
				842
				843	/* combine the two values */
				844	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				845
				846	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	847	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	i++;
				849	cbWritten += 4;
				850	}
				851	}
				852	}
				853	else {
				854	*p++ = (char)(0xe0 \| (ch >> 12));
				855	cbWritten += 3;
				856	}
				857	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				858	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	859	}
				860	}
				861	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	862	if (_PyString_Resize(&v, p - q))
				863	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	864	return v;
				865
				866	onError:
				867	Py_DECREF(v);
				868	return NULL;
				869	}
				870
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	871	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				872	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	873	if (!PyUnicode_Check(unicode)) {
				874	PyErr_BadArgument();
				875	return NULL;
				876	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	877	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				878	PyUnicode_GET_SIZE(unicode),
				879	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	880	}
				881
				882	/* --- UTF-16 Codec ------------------------------------------------------- */
				883
				884	static
				885	int utf16_decoding_error(const Py_UNICODE **source,
				886	Py_UNICODE **dest,
				887	const char *errors,
				888	const char *details)
				889	{
				890	if ((errors == NULL) \|\|
				891	(strcmp(errors,"strict") == 0)) {
				892	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	893	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	894	details);
				895	return -1;
				896	}
				897	else if (strcmp(errors,"ignore") == 0) {
				898	return 0;
				899	}
				900	else if (strcmp(errors,"replace") == 0) {
				901	if (dest) {
				902	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				903	(*dest)++;
				904	}
				905	return 0;
				906	}
				907	else {
				908	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	909	"UTF-16 decoding error; "
				910	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	911	errors);
				912	return -1;
				913	}
				914	}
				915
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	916	PyObject PyUnicode_DecodeUTF16(const char s,
				917	int size,
				918	const char *errors,
				919	int *byteorder)
				920	{
				921	PyUnicodeObject *unicode;
				922	Py_UNICODE *p;
				923	const Py_UNICODE q, e;
				924	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	925	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	926
				927	/* size should be an even number */
				928	if (size % sizeof(Py_UNICODE) != 0) {
				929	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				930	return NULL;
				931	/* The remaining input chars are ignored if we fall through
				932	here... */
				933	}
				934
				935	/* Note: size will always be longer than the resulting Unicode
				936	character count */
				937	unicode = _PyUnicode_New(size);
				938	if (!unicode)
				939	return NULL;
				940	if (size == 0)
				941	return (PyObject *)unicode;
				942
				943	/* Unpack UTF-16 encoded data */
				944	p = unicode->str;
				945	q = (Py_UNICODE *)s;
				946	e = q + (size / sizeof(Py_UNICODE));
				947
				948	if (byteorder)
				949	bo = *byteorder;
				950
				951	while (q < e) {
				952	register Py_UNICODE ch = *q++;
				953
				954	/* Check for BOM marks (U+FEFF) in the input and adjust
				955	current byte order setting accordingly. Swap input
				956	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				957	!) */
				958	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				959	if (ch == 0xFEFF) {
				960	bo = -1;
				961	continue;
				962	} else if (ch == 0xFFFE) {
				963	bo = 1;
				964	continue;
				965	}
				966	if (bo == 1)
				967	ch = (ch >> 8) \| (ch << 8);
				968	#else
				969	if (ch == 0xFEFF) {
				970	bo = 1;
				971	continue;
				972	} else if (ch == 0xFFFE) {
				973	bo = -1;
				974	continue;
				975	}
				976	if (bo == -1)
				977	ch = (ch >> 8) \| (ch << 8);
				978	#endif
				979	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				980	*p++ = ch;
				981	continue;
				982	}
				983
				984	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	985	if (q >= e) {
				986	errmsg = "unexpected end of data";
				987	goto utf16Error;
				988	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	989	if (0xDC00 <= q && q <= 0xDFFF) {
				990	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	991	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	992	/* This is valid data (a UTF-16 surrogate pair), but
				993	we are not able to store this information since our
				994	Py_UNICODE type only has 16 bits... this might
				995	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	996	errmsg = "code pairs are not supported";
				997	goto utf16Error;
				998	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	999	else
				1000	continue;
				1001	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1002	errmsg = "illegal encoding";
				1003	/* Fall through to report the error */
				1004
				1005	utf16Error:
				1006	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1007	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1008	}
				1009
				1010	if (byteorder)
				1011	*byteorder = bo;
				1012
				1013	/* Adjust length */
				1014	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1015	goto onError;
				1016
				1017	return (PyObject *)unicode;
				1018
				1019	onError:
				1020	Py_DECREF(unicode);
				1021	return NULL;
				1022	}
				1023
				1024	#undef UTF16_ERROR
				1025
				1026	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1027	int size,
				1028	const char *errors,
				1029	int byteorder)
				1030	{
				1031	PyObject *v;
				1032	Py_UNICODE *p;
				1033	char *q;
				1034
				1035	/* We don't create UTF-16 pairs... */
				1036	v = PyString_FromStringAndSize(NULL,
				1037	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1038	if (v == NULL)
				1039	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1040
				1041	q = PyString_AS_STRING(v);
				1042	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1043	if (byteorder == 0)
				1044	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1045	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1046	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1047	if (byteorder == 0 \|\|
				1048	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1049	byteorder == -1
				1050	#else
				1051	byteorder == 1
				1052	#endif
				1053	)
				1054	memcpy(p, s, size * sizeof(Py_UNICODE));
				1055	else
				1056	while (size-- > 0) {
				1057	Py_UNICODE ch = *s++;
				1058	*p++ = (ch >> 8) \| (ch << 8);
				1059	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	return v;
				1061	}
				1062
				1063	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1064	{
				1065	if (!PyUnicode_Check(unicode)) {
				1066	PyErr_BadArgument();
				1067	return NULL;
				1068	}
				1069	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1070	PyUnicode_GET_SIZE(unicode),
				1071	NULL,
				1072	0);
				1073	}
				1074
				1075	/* --- Unicode Escape Codec ----------------------------------------------- */
				1076
				1077	static
				1078	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1079	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1080	const char *errors,
				1081	const char *details)
				1082	{
				1083	if ((errors == NULL) \|\|
				1084	(strcmp(errors,"strict") == 0)) {
				1085	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1086	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1087	details);
				1088	return -1;
				1089	}
				1090	else if (strcmp(errors,"ignore") == 0) {
				1091	return 0;
				1092	}
				1093	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1094	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1095	return 0;
				1096	}
				1097	else {
				1098	PyErr_Format(PyExc_ValueError,
				1099	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1100	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1101	errors);
				1102	return -1;
				1103	}
				1104	}
				1105
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1106	static _PyUnicode_Name_CAPI *unicode_names = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1107
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1109	int size,
				1110	const char *errors)
				1111	{
				1112	PyUnicodeObject *v;
				1113	Py_UNICODE p = NULL, buf = NULL;
				1114	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1115	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1116
				1117	/* Escaped strings will always be longer than the resulting
				1118	Unicode string, so we start with size here and then reduce the
				1119	length after conversion to the true value. */
				1120	v = _PyUnicode_New(size);
				1121	if (v == NULL)
				1122	goto onError;
				1123	if (size == 0)
				1124	return (PyObject *)v;
				1125	p = buf = PyUnicode_AS_UNICODE(v);
				1126	end = s + size;
				1127	while (s < end) {
				1128	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1129	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1130	int i;
				1131
				1132	/* Non-escape characters are interpreted as Unicode ordinals */
				1133	if (*s != '\\') {
				1134	p++ = (unsigned char)s++;
				1135	continue;
				1136	}
				1137
				1138	/* \ - Escapes */
				1139	s++;
				1140	switch (*s++) {
				1141
				1142	/* \x escapes */
				1143	case '\n': break;
				1144	case '\\': *p++ = '\\'; break;
				1145	case '\'': *p++ = '\''; break;
				1146	case '\"': *p++ = '\"'; break;
				1147	case 'b': *p++ = '\b'; break;
				1148	case 'f': p++ = '\014'; break; / FF */
				1149	case 't': *p++ = '\t'; break;
				1150	case 'n': *p++ = '\n'; break;
				1151	case 'r': *p++ = '\r'; break;
				1152	case 'v': p++ = '\013'; break; / VT */
				1153	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1154
				1155	/* \OOO (octal) escapes */
				1156	case '0': case '1': case '2': case '3':
				1157	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1158	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1160	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1162	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1163	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1164	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1165	break;
				1166
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1167	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1168	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1169	for (x = 0, i = 0; i < 2; i++) {
				1170	c = (unsigned char)s[i];
				1171	if (!isxdigit(c)) {
				1172	if (unicodeescape_decoding_error(&s, &x, errors,
				1173	"truncated \\xXX"))
				1174	goto onError;
				1175	i++;
				1176	break;
				1177	}
				1178	x = (x<<4) & ~0xF;
				1179	if (c >= '0' && c <= '9')
				1180	x += c - '0';
				1181	else if (c >= 'a' && c <= 'f')
				1182	x += 10 + c - 'a';
				1183	else
				1184	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1185	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1186	s += i;
				1187	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1188	break;
				1189
				1190	/* \uXXXX with 4 hex digits */
				1191	case 'u':
				1192	for (x = 0, i = 0; i < 4; i++) {
				1193	c = (unsigned char)s[i];
				1194	if (!isxdigit(c)) {
				1195	if (unicodeescape_decoding_error(&s, &x, errors,
				1196	"truncated \\uXXXX"))
				1197	goto onError;
				1198	i++;
				1199	break;
				1200	}
				1201	x = (x<<4) & ~0xF;
				1202	if (c >= '0' && c <= '9')
				1203	x += c - '0';
				1204	else if (c >= 'a' && c <= 'f')
				1205	x += 10 + c - 'a';
				1206	else
				1207	x += 10 + c - 'A';
				1208	}
				1209	s += i;
				1210	*p++ = x;
				1211	break;
				1212
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1213	/* \UXXXXXXXX with 8 hex digits */
				1214	case 'U':
				1215	for (chr = 0, i = 0; i < 8; i++) {
				1216	c = (unsigned char)s[i];
				1217	if (!isxdigit(c)) {
				1218	if (unicodeescape_decoding_error(&s, &x, errors,
				1219	"truncated \\uXXXX"))
				1220	goto onError;
				1221	i++;
				1222	break;
				1223	}
				1224	chr = (chr<<4) & ~0xF;
				1225	if (c >= '0' && c <= '9')
				1226	chr += c - '0';
				1227	else if (c >= 'a' && c <= 'f')
				1228	chr += 10 + c - 'a';
				1229	else
				1230	chr += 10 + c - 'A';
				1231	}
				1232	s += i;
				1233	goto store;
				1234
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1235	case 'N':
				1236	/* Ok, we need to deal with Unicode Character Names now,
				1237	* make sure we've imported the hash table data...
				1238	*/
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1239	if (unicode_names == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1240	PyObject mod = 0, v = 0;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1241	mod = PyImport_ImportModule("ucnhash");
				1242	if (mod == NULL)
				1243	goto onError;
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1244	v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1245	Py_DECREF(mod);
				1246	if (v == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1247	goto onError;
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1248	unicode_names = PyCObject_AsVoidPtr(v);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1249	Py_DECREF(v);
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1250	if (unicode_names == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1251	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1252	}
				1253
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1254	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1255	const char *start = s + 1;
				1256	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1257
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1258	/* look for the closing brace */
				1259	while (*endBrace != '}' && endBrace < end)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1260	endBrace++;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1261	if (endBrace != end && *endBrace == '}') {
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1262	if (!unicode_names->getcode(start, endBrace-start, &chr)) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1263	if (unicodeescape_decoding_error(
				1264	&s, &x, errors,
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1265	"Invalid Unicode Character Name")
				1266	)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1267	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1268	goto ucnFallthrough;
				1269	}
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1270	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1271	goto store;
				1272	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1273	if (unicodeescape_decoding_error(
				1274	&s, &x, errors,
				1275	"Unicode name missing closing brace"))
				1276	goto onError;
				1277	goto ucnFallthrough;
				1278	}
				1279	break;
				1280	}
				1281	if (unicodeescape_decoding_error(
				1282	&s, &x, errors,
				1283	"Missing opening brace for Unicode Character Name escape"))
				1284	goto onError;
				1285	ucnFallthrough:
				1286	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1287	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288	*p++ = '\\';
				1289	*p++ = (unsigned char)s[-1];
				1290	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1291	store:
				1292	/* when we get here, chr is a 32-bit unicode character */
				1293	if (chr <= 0xffff)
				1294	/* UCS-2 character */
				1295	*p++ = (Py_UNICODE) chr;
				1296	else if (chr <= 0x10ffff) {
				1297	/* UCS-4 character. store as two surrogate characters */
				1298	chr -= 0x10000L;
				1299	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1300	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1301	} else {
				1302	if (unicodeescape_decoding_error(
				1303	&s, &x, errors,
				1304	"Illegal Unicode character")
				1305	)
				1306	goto onError;
				1307	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1308	}
				1309	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1310	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1311	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1312	return (PyObject *)v;
				1313
				1314	onError:
				1315	Py_XDECREF(v);
				1316	return NULL;
				1317	}
				1318
				1319	/* Return a Unicode-Escape string version of the Unicode object.
				1320
				1321	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1322	appropriate.
				1323
				1324	*/
				1325
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1326	static const Py_UNICODE findchar(const Py_UNICODE s,
				1327	int size,
				1328	Py_UNICODE ch);
				1329
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1330	static
				1331	PyObject unicodeescape_string(const Py_UNICODE s,
				1332	int size,
				1333	int quotes)
				1334	{
				1335	PyObject *repr;
				1336	char *p;
				1337	char *q;
				1338
				1339	static const char *hexdigit = "0123456789ABCDEF";
				1340
				1341	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1342	if (repr == NULL)
				1343	return NULL;
				1344
				1345	p = q = PyString_AS_STRING(repr);
				1346
				1347	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1348	*p++ = 'u';
				1349	*p++ = (findchar(s, size, '\'') &&
				1350	!findchar(s, size, '"')) ? '"' : '\'';
				1351	}
				1352	while (size-- > 0) {
				1353	Py_UNICODE ch = *s++;
				1354	/* Escape quotes */
				1355	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1356	*p++ = '\\';
				1357	*p++ = (char) ch;
				1358	}
				1359	/* Map 16-bit characters to '\uxxxx' */
				1360	else if (ch >= 256) {
				1361	*p++ = '\\';
				1362	*p++ = 'u';
				1363	*p++ = hexdigit[(ch >> 12) & 0xf];
				1364	*p++ = hexdigit[(ch >> 8) & 0xf];
				1365	*p++ = hexdigit[(ch >> 4) & 0xf];
				1366	*p++ = hexdigit[ch & 15];
				1367	}
				1368	/* Map non-printable US ASCII to '\ooo' */
				1369	else if (ch < ' ' \|\| ch >= 128) {
				1370	*p++ = '\\';
				1371	*p++ = hexdigit[(ch >> 6) & 7];
				1372	*p++ = hexdigit[(ch >> 3) & 7];
				1373	*p++ = hexdigit[ch & 7];
				1374	}
				1375	/* Copy everything else as-is */
				1376	else
				1377	*p++ = (char) ch;
				1378	}
				1379	if (quotes)
				1380	*p++ = q[1];
				1381
				1382	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1383	if (_PyString_Resize(&repr, p - q))
				1384	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1385
				1386	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1387
				1388	onError:
				1389	Py_DECREF(repr);
				1390	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1391	}
				1392
				1393	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1394	int size)
				1395	{
				1396	return unicodeescape_string(s, size, 0);
				1397	}
				1398
				1399	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1400	{
				1401	if (!PyUnicode_Check(unicode)) {
				1402	PyErr_BadArgument();
				1403	return NULL;
				1404	}
				1405	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1406	PyUnicode_GET_SIZE(unicode));
				1407	}
				1408
				1409	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1410
				1411	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1412	int size,
				1413	const char *errors)
				1414	{
				1415	PyUnicodeObject *v;
				1416	Py_UNICODE p, buf;
				1417	const char *end;
				1418	const char *bs;
				1419
				1420	/* Escaped strings will always be longer than the resulting
				1421	Unicode string, so we start with size here and then reduce the
				1422	length after conversion to the true value. */
				1423	v = _PyUnicode_New(size);
				1424	if (v == NULL)
				1425	goto onError;
				1426	if (size == 0)
				1427	return (PyObject *)v;
				1428	p = buf = PyUnicode_AS_UNICODE(v);
				1429	end = s + size;
				1430	while (s < end) {
				1431	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1432	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1433	int i;
				1434
				1435	/* Non-escape characters are interpreted as Unicode ordinals */
				1436	if (*s != '\\') {
				1437	p++ = (unsigned char)s++;
				1438	continue;
				1439	}
				1440
				1441	/* \u-escapes are only interpreted iff the number of leading
				1442	backslashes if odd */
				1443	bs = s;
				1444	for (;s < end;) {
				1445	if (*s != '\\')
				1446	break;
				1447	p++ = (unsigned char)s++;
				1448	}
				1449	if (((s - bs) & 1) == 0 \|\|
				1450	s >= end \|\|
				1451	*s != 'u') {
				1452	continue;
				1453	}
				1454	p--;
				1455	s++;
				1456
				1457	/* \uXXXX with 4 hex digits */
				1458	for (x = 0, i = 0; i < 4; i++) {
				1459	c = (unsigned char)s[i];
				1460	if (!isxdigit(c)) {
				1461	if (unicodeescape_decoding_error(&s, &x, errors,
				1462	"truncated \\uXXXX"))
				1463	goto onError;
				1464	i++;
				1465	break;
				1466	}
				1467	x = (x<<4) & ~0xF;
				1468	if (c >= '0' && c <= '9')
				1469	x += c - '0';
				1470	else if (c >= 'a' && c <= 'f')
				1471	x += 10 + c - 'a';
				1472	else
				1473	x += 10 + c - 'A';
				1474	}
				1475	s += i;
				1476	*p++ = x;
				1477	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1478	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1479	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1480	return (PyObject *)v;
				1481
				1482	onError:
				1483	Py_XDECREF(v);
				1484	return NULL;
				1485	}
				1486
				1487	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1488	int size)
				1489	{
				1490	PyObject *repr;
				1491	char *p;
				1492	char *q;
				1493
				1494	static const char *hexdigit = "0123456789ABCDEF";
				1495
				1496	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1497	if (repr == NULL)
				1498	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1499	if (size == 0)
				1500	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1501
				1502	p = q = PyString_AS_STRING(repr);
				1503	while (size-- > 0) {
				1504	Py_UNICODE ch = *s++;
				1505	/* Map 16-bit characters to '\uxxxx' */
				1506	if (ch >= 256) {
				1507	*p++ = '\\';
				1508	*p++ = 'u';
				1509	*p++ = hexdigit[(ch >> 12) & 0xf];
				1510	*p++ = hexdigit[(ch >> 8) & 0xf];
				1511	*p++ = hexdigit[(ch >> 4) & 0xf];
				1512	*p++ = hexdigit[ch & 15];
				1513	}
				1514	/* Copy everything else as-is */
				1515	else
				1516	*p++ = (char) ch;
				1517	}
				1518	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1519	if (_PyString_Resize(&repr, p - q))
				1520	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1521
				1522	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1523
				1524	onError:
				1525	Py_DECREF(repr);
				1526	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1527	}
				1528
				1529	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1530	{
				1531	if (!PyUnicode_Check(unicode)) {
				1532	PyErr_BadArgument();
				1533	return NULL;
				1534	}
				1535	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1536	PyUnicode_GET_SIZE(unicode));
				1537	}
				1538
				1539	/* --- Latin-1 Codec ------------------------------------------------------ */
				1540
				1541	PyObject PyUnicode_DecodeLatin1(const char s,
				1542	int size,
				1543	const char *errors)
				1544	{
				1545	PyUnicodeObject *v;
				1546	Py_UNICODE *p;
				1547
				1548	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1549	v = _PyUnicode_New(size);
				1550	if (v == NULL)
				1551	goto onError;
				1552	if (size == 0)
				1553	return (PyObject *)v;
				1554	p = PyUnicode_AS_UNICODE(v);
				1555	while (size-- > 0)
				1556	p++ = (unsigned char)s++;
				1557	return (PyObject *)v;
				1558
				1559	onError:
				1560	Py_XDECREF(v);
				1561	return NULL;
				1562	}
				1563
				1564	static
				1565	int latin1_encoding_error(const Py_UNICODE **source,
				1566	char **dest,
				1567	const char *errors,
				1568	const char *details)
				1569	{
				1570	if ((errors == NULL) \|\|
				1571	(strcmp(errors,"strict") == 0)) {
				1572	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1573	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1574	details);
				1575	return -1;
				1576	}
				1577	else if (strcmp(errors,"ignore") == 0) {
				1578	return 0;
				1579	}
				1580	else if (strcmp(errors,"replace") == 0) {
				1581	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1582	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1583	return 0;
				1584	}
				1585	else {
				1586	PyErr_Format(PyExc_ValueError,
				1587	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1588	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1589	errors);
				1590	return -1;
				1591	}
				1592	}
				1593
				1594	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1595	int size,
				1596	const char *errors)
				1597	{
				1598	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1599	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1600
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1601	repr = PyString_FromStringAndSize(NULL, size);
				1602	if (repr == NULL)
				1603	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1604	if (size == 0)
				1605	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1606
				1607	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1608	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1609	while (size-- > 0) {
				1610	Py_UNICODE ch = *p++;
				1611	if (ch >= 256) {
				1612	if (latin1_encoding_error(&p, &s, errors,
				1613	"ordinal not in range(256)"))
				1614	goto onError;
				1615	}
				1616	else
				1617	*s++ = (char)ch;
				1618	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1619	/* Resize if error handling skipped some characters */
				1620	if (s - start < PyString_GET_SIZE(repr))
				1621	if (_PyString_Resize(&repr, s - start))
				1622	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1623	return repr;
				1624
				1625	onError:
				1626	Py_DECREF(repr);
				1627	return NULL;
				1628	}
				1629
				1630	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1631	{
				1632	if (!PyUnicode_Check(unicode)) {
				1633	PyErr_BadArgument();
				1634	return NULL;
				1635	}
				1636	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1637	PyUnicode_GET_SIZE(unicode),
				1638	NULL);
				1639	}
				1640
				1641	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1642
				1643	static
				1644	int ascii_decoding_error(const char **source,
				1645	Py_UNICODE **dest,
				1646	const char *errors,
				1647	const char *details)
				1648	{
				1649	if ((errors == NULL) \|\|
				1650	(strcmp(errors,"strict") == 0)) {
				1651	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1652	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1653	details);
				1654	return -1;
				1655	}
				1656	else if (strcmp(errors,"ignore") == 0) {
				1657	return 0;
				1658	}
				1659	else if (strcmp(errors,"replace") == 0) {
				1660	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1661	(*dest)++;
				1662	return 0;
				1663	}
				1664	else {
				1665	PyErr_Format(PyExc_ValueError,
				1666	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1667	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1668	errors);
				1669	return -1;
				1670	}
				1671	}
				1672
				1673	PyObject PyUnicode_DecodeASCII(const char s,
				1674	int size,
				1675	const char *errors)
				1676	{
				1677	PyUnicodeObject *v;
				1678	Py_UNICODE *p;
				1679
				1680	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1681	v = _PyUnicode_New(size);
				1682	if (v == NULL)
				1683	goto onError;
				1684	if (size == 0)
				1685	return (PyObject *)v;
				1686	p = PyUnicode_AS_UNICODE(v);
				1687	while (size-- > 0) {
				1688	register unsigned char c;
				1689
				1690	c = (unsigned char)*s++;
				1691	if (c < 128)
				1692	*p++ = c;
				1693	else if (ascii_decoding_error(&s, &p, errors,
				1694	"ordinal not in range(128)"))
				1695	goto onError;
				1696	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1697	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1698	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1699	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1700	return (PyObject *)v;
				1701
				1702	onError:
				1703	Py_XDECREF(v);
				1704	return NULL;
				1705	}
				1706
				1707	static
				1708	int ascii_encoding_error(const Py_UNICODE **source,
				1709	char **dest,
				1710	const char *errors,
				1711	const char *details)
				1712	{
				1713	if ((errors == NULL) \|\|
				1714	(strcmp(errors,"strict") == 0)) {
				1715	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1716	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1717	details);
				1718	return -1;
				1719	}
				1720	else if (strcmp(errors,"ignore") == 0) {
				1721	return 0;
				1722	}
				1723	else if (strcmp(errors,"replace") == 0) {
				1724	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1725	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1726	return 0;
				1727	}
				1728	else {
				1729	PyErr_Format(PyExc_ValueError,
				1730	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1731	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1732	errors);
				1733	return -1;
				1734	}
				1735	}
				1736
				1737	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1738	int size,
				1739	const char *errors)
				1740	{
				1741	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1742	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1743
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1744	repr = PyString_FromStringAndSize(NULL, size);
				1745	if (repr == NULL)
				1746	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1747	if (size == 0)
				1748	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1749
				1750	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1751	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1752	while (size-- > 0) {
				1753	Py_UNICODE ch = *p++;
				1754	if (ch >= 128) {
				1755	if (ascii_encoding_error(&p, &s, errors,
				1756	"ordinal not in range(128)"))
				1757	goto onError;
				1758	}
				1759	else
				1760	*s++ = (char)ch;
				1761	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1762	/* Resize if error handling skipped some characters */
				1763	if (s - start < PyString_GET_SIZE(repr))
				1764	if (_PyString_Resize(&repr, s - start))
				1765	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1766	return repr;
				1767
				1768	onError:
				1769	Py_DECREF(repr);
				1770	return NULL;
				1771	}
				1772
				1773	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1774	{
				1775	if (!PyUnicode_Check(unicode)) {
				1776	PyErr_BadArgument();
				1777	return NULL;
				1778	}
				1779	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1780	PyUnicode_GET_SIZE(unicode),
				1781	NULL);
				1782	}
				1783
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1784	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1785
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1786	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1787
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1788	PyObject PyUnicode_DecodeMBCS(const char s,
				1789	int size,
				1790	const char *errors)
				1791	{
				1792	PyUnicodeObject *v;
				1793	Py_UNICODE *p;
				1794
				1795	/* First get the size of the result */
				1796	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1797	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1798	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1799
				1800	v = _PyUnicode_New(usize);
				1801	if (v == NULL)
				1802	return NULL;
				1803	if (usize == 0)
				1804	return (PyObject *)v;
				1805	p = PyUnicode_AS_UNICODE(v);
				1806	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1807	Py_DECREF(v);
				1808	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1809	}
				1810
				1811	return (PyObject *)v;
				1812	}
				1813
				1814	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1815	int size,
				1816	const char *errors)
				1817	{
				1818	PyObject *repr;
				1819	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1820	DWORD mbcssize;
				1821
				1822	/* If there are no characters, bail now! */
				1823	if (size==0)
				1824	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1825
				1826	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1827	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1828	if (mbcssize==0)
				1829	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1830
				1831	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1832	if (repr == NULL)
				1833	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1834	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1835	return repr;
				1836
				1837	/* Do the conversion */
				1838	s = PyString_AS_STRING(repr);
				1839	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1840	Py_DECREF(repr);
				1841	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1842	}
				1843	return repr;
				1844	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1845
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1846	#endif /* MS_WIN32 */
				1847
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1848	/* --- Character Mapping Codec -------------------------------------------- */
				1849
				1850	static
				1851	int charmap_decoding_error(const char **source,
				1852	Py_UNICODE **dest,
				1853	const char *errors,
				1854	const char *details)
				1855	{
				1856	if ((errors == NULL) \|\|
				1857	(strcmp(errors,"strict") == 0)) {
				1858	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1859	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1860	details);
				1861	return -1;
				1862	}
				1863	else if (strcmp(errors,"ignore") == 0) {
				1864	return 0;
				1865	}
				1866	else if (strcmp(errors,"replace") == 0) {
				1867	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1868	(*dest)++;
				1869	return 0;
				1870	}
				1871	else {
				1872	PyErr_Format(PyExc_ValueError,
				1873	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1874	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1875	errors);
				1876	return -1;
				1877	}
				1878	}
				1879
				1880	PyObject PyUnicode_DecodeCharmap(const char s,
				1881	int size,
				1882	PyObject *mapping,
				1883	const char *errors)
				1884	{
				1885	PyUnicodeObject *v;
				1886	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1887	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1888
				1889	/* Default to Latin-1 */
				1890	if (mapping == NULL)
				1891	return PyUnicode_DecodeLatin1(s, size, errors);
				1892
				1893	v = _PyUnicode_New(size);
				1894	if (v == NULL)
				1895	goto onError;
				1896	if (size == 0)
				1897	return (PyObject *)v;
				1898	p = PyUnicode_AS_UNICODE(v);
				1899	while (size-- > 0) {
				1900	unsigned char ch = *s++;
				1901	PyObject w, x;
				1902
				1903	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1904	w = PyInt_FromLong((long)ch);
				1905	if (w == NULL)
				1906	goto onError;
				1907	x = PyObject_GetItem(mapping, w);
				1908	Py_DECREF(w);
				1909	if (x == NULL) {
				1910	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1911	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1912	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1913	x = Py_None;
				1914	Py_INCREF(x);
				1915	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1916	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1917	}
				1918
				1919	/* Apply mapping */
				1920	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1921	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1922	if (value < 0 \|\| value > 65535) {
				1923	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1924	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1925	Py_DECREF(x);
				1926	goto onError;
				1927	}
				1928	*p++ = (Py_UNICODE)value;
				1929	}
				1930	else if (x == Py_None) {
				1931	/* undefined mapping */
				1932	if (charmap_decoding_error(&s, &p, errors,
				1933	"character maps to <undefined>")) {
				1934	Py_DECREF(x);
				1935	goto onError;
				1936	}
				1937	}
				1938	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1939	int targetsize = PyUnicode_GET_SIZE(x);
				1940
				1941	if (targetsize == 1)
				1942	/* 1-1 mapping */
				1943	p++ = PyUnicode_AS_UNICODE(x);
				1944
				1945	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1946	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1947	if (targetsize > extrachars) {
				1948	/* resize first */
				1949	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				1950	int needed = (targetsize - extrachars) + \
				1951	(targetsize << 2);
				1952	extrachars += needed;
				1953	if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1954	Py_DECREF(x);
				1955	goto onError;
				1956	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1957	p = PyUnicode_AS_UNICODE(v) + oldpos;
				1958	}
				1959	Py_UNICODE_COPY(p,
				1960	PyUnicode_AS_UNICODE(x),
				1961	targetsize);
				1962	p += targetsize;
				1963	extrachars -= targetsize;
				1964	}
				1965	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1966	}
				1967	else {
				1968	/* wrong return value */
				1969	PyErr_SetString(PyExc_TypeError,
				1970	"character mapping must return integer, None or unicode");
				1971	Py_DECREF(x);
				1972	goto onError;
				1973	}
				1974	Py_DECREF(x);
				1975	}
				1976	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1977	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1978	goto onError;
				1979	return (PyObject *)v;
				1980
				1981	onError:
				1982	Py_XDECREF(v);
				1983	return NULL;
				1984	}
				1985
				1986	static
				1987	int charmap_encoding_error(const Py_UNICODE **source,
				1988	char **dest,
				1989	const char *errors,
				1990	const char *details)
				1991	{
				1992	if ((errors == NULL) \|\|
				1993	(strcmp(errors,"strict") == 0)) {
				1994	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1995	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1996	details);
				1997	return -1;
				1998	}
				1999	else if (strcmp(errors,"ignore") == 0) {
				2000	return 0;
				2001	}
				2002	else if (strcmp(errors,"replace") == 0) {
				2003	**dest = '?';
				2004	(*dest)++;
				2005	return 0;
				2006	}
				2007	else {
				2008	PyErr_Format(PyExc_ValueError,
				2009	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2010	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2011	errors);
				2012	return -1;
				2013	}
				2014	}
				2015
				2016	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2017	int size,
				2018	PyObject *mapping,
				2019	const char *errors)
				2020	{
				2021	PyObject *v;
				2022	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2023	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2024
				2025	/* Default to Latin-1 */
				2026	if (mapping == NULL)
				2027	return PyUnicode_EncodeLatin1(p, size, errors);
				2028
				2029	v = PyString_FromStringAndSize(NULL, size);
				2030	if (v == NULL)
				2031	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2032	if (size == 0)
				2033	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2034	s = PyString_AS_STRING(v);
				2035	while (size-- > 0) {
				2036	Py_UNICODE ch = *p++;
				2037	PyObject w, x;
				2038
				2039	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2040	w = PyInt_FromLong((long)ch);
				2041	if (w == NULL)
				2042	goto onError;
				2043	x = PyObject_GetItem(mapping, w);
				2044	Py_DECREF(w);
				2045	if (x == NULL) {
				2046	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2047	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2048	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2049	x = Py_None;
				2050	Py_INCREF(x);
				2051	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2052	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2053	}
				2054
				2055	/* Apply mapping */
				2056	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2057	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2058	if (value < 0 \|\| value > 255) {
				2059	PyErr_SetString(PyExc_TypeError,
				2060	"character mapping must be in range(256)");
				2061	Py_DECREF(x);
				2062	goto onError;
				2063	}
				2064	*s++ = (char)value;
				2065	}
				2066	else if (x == Py_None) {
				2067	/* undefined mapping */
				2068	if (charmap_encoding_error(&p, &s, errors,
				2069	"character maps to <undefined>")) {
				2070	Py_DECREF(x);
				2071	goto onError;
				2072	}
				2073	}
				2074	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2075	int targetsize = PyString_GET_SIZE(x);
				2076
				2077	if (targetsize == 1)
				2078	/* 1-1 mapping */
				2079	s++ = PyString_AS_STRING(x);
				2080
				2081	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2082	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2083	if (targetsize > extrachars) {
				2084	/* resize first */
				2085	int oldpos = (int)(s - PyString_AS_STRING(v));
				2086	int needed = (targetsize - extrachars) + \
				2087	(targetsize << 2);
				2088	extrachars += needed;
				2089	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2090	Py_DECREF(x);
				2091	goto onError;
				2092	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2093	s = PyString_AS_STRING(v) + oldpos;
				2094	}
				2095	memcpy(s,
				2096	PyString_AS_STRING(x),
				2097	targetsize);
				2098	s += targetsize;
				2099	extrachars -= targetsize;
				2100	}
				2101	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2102	}
				2103	else {
				2104	/* wrong return value */
				2105	PyErr_SetString(PyExc_TypeError,
				2106	"character mapping must return integer, None or unicode");
				2107	Py_DECREF(x);
				2108	goto onError;
				2109	}
				2110	Py_DECREF(x);
				2111	}
				2112	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2113	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2114	goto onError;
				2115	return v;
				2116
				2117	onError:
				2118	Py_DECREF(v);
				2119	return NULL;
				2120	}
				2121
				2122	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2123	PyObject *mapping)
				2124	{
				2125	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2126	PyErr_BadArgument();
				2127	return NULL;
				2128	}
				2129	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2130	PyUnicode_GET_SIZE(unicode),
				2131	mapping,
				2132	NULL);
				2133	}
				2134
				2135	static
				2136	int translate_error(const Py_UNICODE **source,
				2137	Py_UNICODE **dest,
				2138	const char *errors,
				2139	const char *details)
				2140	{
				2141	if ((errors == NULL) \|\|
				2142	(strcmp(errors,"strict") == 0)) {
				2143	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2144	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2145	details);
				2146	return -1;
				2147	}
				2148	else if (strcmp(errors,"ignore") == 0) {
				2149	return 0;
				2150	}
				2151	else if (strcmp(errors,"replace") == 0) {
				2152	**dest = '?';
				2153	(*dest)++;
				2154	return 0;
				2155	}
				2156	else {
				2157	PyErr_Format(PyExc_ValueError,
				2158	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2159	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2160	errors);
				2161	return -1;
				2162	}
				2163	}
				2164
				2165	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2166	int size,
				2167	PyObject *mapping,
				2168	const char *errors)
				2169	{
				2170	PyUnicodeObject *v;
				2171	Py_UNICODE *p;
				2172
				2173	if (mapping == NULL) {
				2174	PyErr_BadArgument();
				2175	return NULL;
				2176	}
				2177
				2178	/* Output will never be longer than input */
				2179	v = _PyUnicode_New(size);
				2180	if (v == NULL)
				2181	goto onError;
				2182	if (size == 0)
				2183	goto done;
				2184	p = PyUnicode_AS_UNICODE(v);
				2185	while (size-- > 0) {
				2186	Py_UNICODE ch = *s++;
				2187	PyObject w, x;
				2188
				2189	/* Get mapping */
				2190	w = PyInt_FromLong(ch);
				2191	if (w == NULL)
				2192	goto onError;
				2193	x = PyObject_GetItem(mapping, w);
				2194	Py_DECREF(w);
				2195	if (x == NULL) {
				2196	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2197	/* No mapping found: default to 1-1 mapping */
				2198	PyErr_Clear();
				2199	*p++ = ch;
				2200	continue;
				2201	}
				2202	goto onError;
				2203	}
				2204
				2205	/* Apply mapping */
				2206	if (PyInt_Check(x))
				2207	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2208	else if (x == Py_None) {
				2209	/* undefined mapping */
				2210	if (translate_error(&s, &p, errors,
				2211	"character maps to <undefined>")) {
				2212	Py_DECREF(x);
				2213	goto onError;
				2214	}
				2215	}
				2216	else if (PyUnicode_Check(x)) {
				2217	if (PyUnicode_GET_SIZE(x) != 1) {
				2218	/* 1-n mapping */
				2219	PyErr_SetString(PyExc_NotImplementedError,
				2220	"1-n mappings are currently not implemented");
				2221	Py_DECREF(x);
				2222	goto onError;
				2223	}
				2224	p++ = PyUnicode_AS_UNICODE(x);
				2225	}
				2226	else {
				2227	/* wrong return value */
				2228	PyErr_SetString(PyExc_TypeError,
				2229	"translate mapping must return integer, None or unicode");
				2230	Py_DECREF(x);
				2231	goto onError;
				2232	}
				2233	Py_DECREF(x);
				2234	}
				2235	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2236	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2237	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2238
				2239	done:
				2240	return (PyObject *)v;
				2241
				2242	onError:
				2243	Py_XDECREF(v);
				2244	return NULL;
				2245	}
				2246
				2247	PyObject PyUnicode_Translate(PyObject str,
				2248	PyObject *mapping,
				2249	const char *errors)
				2250	{
				2251	PyObject *result;
				2252
				2253	str = PyUnicode_FromObject(str);
				2254	if (str == NULL)
				2255	goto onError;
				2256	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2257	PyUnicode_GET_SIZE(str),
				2258	mapping,
				2259	errors);
				2260	Py_DECREF(str);
				2261	return result;
				2262
				2263	onError:
				2264	Py_XDECREF(str);
				2265	return NULL;
				2266	}
				2267
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2268	/* --- Decimal Encoder ---------------------------------------------------- */
				2269
				2270	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2271	int length,
				2272	char *output,
				2273	const char *errors)
				2274	{
				2275	Py_UNICODE p, end;
				2276
				2277	if (output == NULL) {
				2278	PyErr_BadArgument();
				2279	return -1;
				2280	}
				2281
				2282	p = s;
				2283	end = s + length;
				2284	while (p < end) {
				2285	register Py_UNICODE ch = *p++;
				2286	int decimal;
				2287
				2288	if (Py_UNICODE_ISSPACE(ch)) {
				2289	*output++ = ' ';
				2290	continue;
				2291	}
				2292	decimal = Py_UNICODE_TODECIMAL(ch);
				2293	if (decimal >= 0) {
				2294	*output++ = '0' + decimal;
				2295	continue;
				2296	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2297	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2298	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2299	continue;
				2300	}
				2301	/* All other characters are considered invalid */
				2302	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2303	PyErr_SetString(PyExc_ValueError,
				2304	"invalid decimal Unicode string");
				2305	goto onError;
				2306	}
				2307	else if (strcmp(errors, "ignore") == 0)
				2308	continue;
				2309	else if (strcmp(errors, "replace") == 0) {
				2310	*output++ = '?';
				2311	continue;
				2312	}
				2313	}
				2314	/* 0-terminate the output string */
				2315	*output++ = '\0';
				2316	return 0;
				2317
				2318	onError:
				2319	return -1;
				2320	}
				2321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2322	/* --- Helpers ------------------------------------------------------------ */
				2323
				2324	static
				2325	int count(PyUnicodeObject *self,
				2326	int start,
				2327	int end,
				2328	PyUnicodeObject *substring)
				2329	{
				2330	int count = 0;
				2331
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2332	if (start < 0)
				2333	start += self->length;
				2334	if (start < 0)
				2335	start = 0;
				2336	if (end > self->length)
				2337	end = self->length;
				2338	if (end < 0)
				2339	end += self->length;
				2340	if (end < 0)
				2341	end = 0;
				2342
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2343	if (substring->length == 0)
				2344	return (end - start + 1);
				2345
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2346	end -= substring->length;
				2347
				2348	while (start <= end)
				2349	if (Py_UNICODE_MATCH(self, start, substring)) {
				2350	count++;
				2351	start += substring->length;
				2352	} else
				2353	start++;
				2354
				2355	return count;
				2356	}
				2357
				2358	int PyUnicode_Count(PyObject *str,
				2359	PyObject *substr,
				2360	int start,
				2361	int end)
				2362	{
				2363	int result;
				2364
				2365	str = PyUnicode_FromObject(str);
				2366	if (str == NULL)
				2367	return -1;
				2368	substr = PyUnicode_FromObject(substr);
				2369	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2370	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2371	return -1;
				2372	}
				2373
				2374	result = count((PyUnicodeObject *)str,
				2375	start, end,
				2376	(PyUnicodeObject *)substr);
				2377
				2378	Py_DECREF(str);
				2379	Py_DECREF(substr);
				2380	return result;
				2381	}
				2382
				2383	static
				2384	int findstring(PyUnicodeObject *self,
				2385	PyUnicodeObject *substring,
				2386	int start,
				2387	int end,
				2388	int direction)
				2389	{
				2390	if (start < 0)
				2391	start += self->length;
				2392	if (start < 0)
				2393	start = 0;
				2394
				2395	if (substring->length == 0)
				2396	return start;
				2397
				2398	if (end > self->length)
				2399	end = self->length;
				2400	if (end < 0)
				2401	end += self->length;
				2402	if (end < 0)
				2403	end = 0;
				2404
				2405	end -= substring->length;
				2406
				2407	if (direction < 0) {
				2408	for (; end >= start; end--)
				2409	if (Py_UNICODE_MATCH(self, end, substring))
				2410	return end;
				2411	} else {
				2412	for (; start <= end; start++)
				2413	if (Py_UNICODE_MATCH(self, start, substring))
				2414	return start;
				2415	}
				2416
				2417	return -1;
				2418	}
				2419
				2420	int PyUnicode_Find(PyObject *str,
				2421	PyObject *substr,
				2422	int start,
				2423	int end,
				2424	int direction)
				2425	{
				2426	int result;
				2427
				2428	str = PyUnicode_FromObject(str);
				2429	if (str == NULL)
				2430	return -1;
				2431	substr = PyUnicode_FromObject(substr);
				2432	if (substr == NULL) {
				2433	Py_DECREF(substr);
				2434	return -1;
				2435	}
				2436
				2437	result = findstring((PyUnicodeObject *)str,
				2438	(PyUnicodeObject *)substr,
				2439	start, end, direction);
				2440	Py_DECREF(str);
				2441	Py_DECREF(substr);
				2442	return result;
				2443	}
				2444
				2445	static
				2446	int tailmatch(PyUnicodeObject *self,
				2447	PyUnicodeObject *substring,
				2448	int start,
				2449	int end,
				2450	int direction)
				2451	{
				2452	if (start < 0)
				2453	start += self->length;
				2454	if (start < 0)
				2455	start = 0;
				2456
				2457	if (substring->length == 0)
				2458	return 1;
				2459
				2460	if (end > self->length)
				2461	end = self->length;
				2462	if (end < 0)
				2463	end += self->length;
				2464	if (end < 0)
				2465	end = 0;
				2466
				2467	end -= substring->length;
				2468	if (end < start)
				2469	return 0;
				2470
				2471	if (direction > 0) {
				2472	if (Py_UNICODE_MATCH(self, end, substring))
				2473	return 1;
				2474	} else {
				2475	if (Py_UNICODE_MATCH(self, start, substring))
				2476	return 1;
				2477	}
				2478
				2479	return 0;
				2480	}
				2481
				2482	int PyUnicode_Tailmatch(PyObject *str,
				2483	PyObject *substr,
				2484	int start,
				2485	int end,
				2486	int direction)
				2487	{
				2488	int result;
				2489
				2490	str = PyUnicode_FromObject(str);
				2491	if (str == NULL)
				2492	return -1;
				2493	substr = PyUnicode_FromObject(substr);
				2494	if (substr == NULL) {
				2495	Py_DECREF(substr);
				2496	return -1;
				2497	}
				2498
				2499	result = tailmatch((PyUnicodeObject *)str,
				2500	(PyUnicodeObject *)substr,
				2501	start, end, direction);
				2502	Py_DECREF(str);
				2503	Py_DECREF(substr);
				2504	return result;
				2505	}
				2506
				2507	static
				2508	const Py_UNICODE findchar(const Py_UNICODE s,
				2509	int size,
				2510	Py_UNICODE ch)
				2511	{
				2512	/* like wcschr, but doesn't stop at NULL characters */
				2513
				2514	while (size-- > 0) {
				2515	if (*s == ch)
				2516	return s;
				2517	s++;
				2518	}
				2519
				2520	return NULL;
				2521	}
				2522
				2523	/* Apply fixfct filter to the Unicode object self and return a
				2524	reference to the modified object */
				2525
				2526	static
				2527	PyObject fixup(PyUnicodeObject self,
				2528	int (fixfct)(PyUnicodeObject s))
				2529	{
				2530
				2531	PyUnicodeObject *u;
				2532
				2533	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2534	self->length);
				2535	if (u == NULL)
				2536	return NULL;
				2537	if (!fixfct(u)) {
				2538	/* fixfct should return TRUE if it modified the buffer. If
				2539	FALSE, return a reference to the original buffer instead
				2540	(to save space, not time) */
				2541	Py_INCREF(self);
				2542	Py_DECREF(u);
				2543	return (PyObject*) self;
				2544	}
				2545	return (PyObject*) u;
				2546	}
				2547
				2548	static
				2549	int fixupper(PyUnicodeObject *self)
				2550	{
				2551	int len = self->length;
				2552	Py_UNICODE *s = self->str;
				2553	int status = 0;
				2554
				2555	while (len-- > 0) {
				2556	register Py_UNICODE ch;
				2557
				2558	ch = Py_UNICODE_TOUPPER(*s);
				2559	if (ch != *s) {
				2560	status = 1;
				2561	*s = ch;
				2562	}
				2563	s++;
				2564	}
				2565
				2566	return status;
				2567	}
				2568
				2569	static
				2570	int fixlower(PyUnicodeObject *self)
				2571	{
				2572	int len = self->length;
				2573	Py_UNICODE *s = self->str;
				2574	int status = 0;
				2575
				2576	while (len-- > 0) {
				2577	register Py_UNICODE ch;
				2578
				2579	ch = Py_UNICODE_TOLOWER(*s);
				2580	if (ch != *s) {
				2581	status = 1;
				2582	*s = ch;
				2583	}
				2584	s++;
				2585	}
				2586
				2587	return status;
				2588	}
				2589
				2590	static
				2591	int fixswapcase(PyUnicodeObject *self)
				2592	{
				2593	int len = self->length;
				2594	Py_UNICODE *s = self->str;
				2595	int status = 0;
				2596
				2597	while (len-- > 0) {
				2598	if (Py_UNICODE_ISUPPER(*s)) {
				2599	s = Py_UNICODE_TOLOWER(s);
				2600	status = 1;
				2601	} else if (Py_UNICODE_ISLOWER(*s)) {
				2602	s = Py_UNICODE_TOUPPER(s);
				2603	status = 1;
				2604	}
				2605	s++;
				2606	}
				2607
				2608	return status;
				2609	}
				2610
				2611	static
				2612	int fixcapitalize(PyUnicodeObject *self)
				2613	{
				2614	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2615	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2616	return 1;
				2617	}
				2618	return 0;
				2619	}
				2620
				2621	static
				2622	int fixtitle(PyUnicodeObject *self)
				2623	{
				2624	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2625	register Py_UNICODE *e;
				2626	int previous_is_cased;
				2627
				2628	/* Shortcut for single character strings */
				2629	if (PyUnicode_GET_SIZE(self) == 1) {
				2630	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2631	if (*p != ch) {
				2632	*p = ch;
				2633	return 1;
				2634	}
				2635	else
				2636	return 0;
				2637	}
				2638
				2639	e = p + PyUnicode_GET_SIZE(self);
				2640	previous_is_cased = 0;
				2641	for (; p < e; p++) {
				2642	register const Py_UNICODE ch = *p;
				2643
				2644	if (previous_is_cased)
				2645	*p = Py_UNICODE_TOLOWER(ch);
				2646	else
				2647	*p = Py_UNICODE_TOTITLE(ch);
				2648
				2649	if (Py_UNICODE_ISLOWER(ch) \|\|
				2650	Py_UNICODE_ISUPPER(ch) \|\|
				2651	Py_UNICODE_ISTITLE(ch))
				2652	previous_is_cased = 1;
				2653	else
				2654	previous_is_cased = 0;
				2655	}
				2656	return 1;
				2657	}
				2658
				2659	PyObject PyUnicode_Join(PyObject separator,
				2660	PyObject *seq)
				2661	{
				2662	Py_UNICODE *sep;
				2663	int seplen;
				2664	PyUnicodeObject *res = NULL;
				2665	int reslen = 0;
				2666	Py_UNICODE *p;
				2667	int seqlen = 0;
				2668	int sz = 100;
				2669	int i;
				2670
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2671	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2672	if (seqlen < 0 && PyErr_Occurred())
				2673	return NULL;
				2674
				2675	if (separator == NULL) {
				2676	Py_UNICODE blank = ' ';
				2677	sep = &blank;
				2678	seplen = 1;
				2679	}
				2680	else {
				2681	separator = PyUnicode_FromObject(separator);
				2682	if (separator == NULL)
				2683	return NULL;
				2684	sep = PyUnicode_AS_UNICODE(separator);
				2685	seplen = PyUnicode_GET_SIZE(separator);
				2686	}
				2687
				2688	res = _PyUnicode_New(sz);
				2689	if (res == NULL)
				2690	goto onError;
				2691	p = PyUnicode_AS_UNICODE(res);
				2692	reslen = 0;
				2693
				2694	for (i = 0; i < seqlen; i++) {
				2695	int itemlen;
				2696	PyObject *item;
				2697
				2698	item = PySequence_GetItem(seq, i);
				2699	if (item == NULL)
				2700	goto onError;
				2701	if (!PyUnicode_Check(item)) {
				2702	PyObject *v;
				2703	v = PyUnicode_FromObject(item);
				2704	Py_DECREF(item);
				2705	item = v;
				2706	if (item == NULL)
				2707	goto onError;
				2708	}
				2709	itemlen = PyUnicode_GET_SIZE(item);
				2710	while (reslen + itemlen + seplen >= sz) {
				2711	if (_PyUnicode_Resize(res, sz*2))
				2712	goto onError;
				2713	sz *= 2;
				2714	p = PyUnicode_AS_UNICODE(res) + reslen;
				2715	}
				2716	if (i > 0) {
				2717	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2718	p += seplen;
				2719	reslen += seplen;
				2720	}
				2721	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2722	p += itemlen;
				2723	reslen += itemlen;
				2724	Py_DECREF(item);
				2725	}
				2726	if (_PyUnicode_Resize(res, reslen))
				2727	goto onError;
				2728
				2729	Py_XDECREF(separator);
				2730	return (PyObject *)res;
				2731
				2732	onError:
				2733	Py_XDECREF(separator);
				2734	Py_DECREF(res);
				2735	return NULL;
				2736	}
				2737
				2738	static
				2739	PyUnicodeObject pad(PyUnicodeObject self,
				2740	int left,
				2741	int right,
				2742	Py_UNICODE fill)
				2743	{
				2744	PyUnicodeObject *u;
				2745
				2746	if (left < 0)
				2747	left = 0;
				2748	if (right < 0)
				2749	right = 0;
				2750
				2751	if (left == 0 && right == 0) {
				2752	Py_INCREF(self);
				2753	return self;
				2754	}
				2755
				2756	u = _PyUnicode_New(left + self->length + right);
				2757	if (u) {
				2758	if (left)
				2759	Py_UNICODE_FILL(u->str, fill, left);
				2760	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2761	if (right)
				2762	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2763	}
				2764
				2765	return u;
				2766	}
				2767
				2768	#define SPLIT_APPEND(data, left, right) \
				2769	str = PyUnicode_FromUnicode(data + left, right - left); \
				2770	if (!str) \
				2771	goto onError; \
				2772	if (PyList_Append(list, str)) { \
				2773	Py_DECREF(str); \
				2774	goto onError; \
				2775	} \
				2776	else \
				2777	Py_DECREF(str);
				2778
				2779	static
				2780	PyObject split_whitespace(PyUnicodeObject self,
				2781	PyObject *list,
				2782	int maxcount)
				2783	{
				2784	register int i;
				2785	register int j;
				2786	int len = self->length;
				2787	PyObject *str;
				2788
				2789	for (i = j = 0; i < len; ) {
				2790	/* find a token */
				2791	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2792	i++;
				2793	j = i;
				2794	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2795	i++;
				2796	if (j < i) {
				2797	if (maxcount-- <= 0)
				2798	break;
				2799	SPLIT_APPEND(self->str, j, i);
				2800	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2801	i++;
				2802	j = i;
				2803	}
				2804	}
				2805	if (j < len) {
				2806	SPLIT_APPEND(self->str, j, len);
				2807	}
				2808	return list;
				2809
				2810	onError:
				2811	Py_DECREF(list);
				2812	return NULL;
				2813	}
				2814
				2815	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2816	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2817	{
				2818	register int i;
				2819	register int j;
				2820	int len;
				2821	PyObject *list;
				2822	PyObject *str;
				2823	Py_UNICODE *data;
				2824
				2825	string = PyUnicode_FromObject(string);
				2826	if (string == NULL)
				2827	return NULL;
				2828	data = PyUnicode_AS_UNICODE(string);
				2829	len = PyUnicode_GET_SIZE(string);
				2830
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2831	list = PyList_New(0);
				2832	if (!list)
				2833	goto onError;
				2834
				2835	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2836	int eol;
				2837
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2838	/* Find a line and append it */
				2839	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2840	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2841
				2842	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2843	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2844	if (i < len) {
				2845	if (data[i] == '\r' && i + 1 < len &&
				2846	data[i+1] == '\n')
				2847	i += 2;
				2848	else
				2849	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2850	if (keepends)
				2851	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2852	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2853	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2854	j = i;
				2855	}
				2856	if (j < len) {
				2857	SPLIT_APPEND(data, j, len);
				2858	}
				2859
				2860	Py_DECREF(string);
				2861	return list;
				2862
				2863	onError:
				2864	Py_DECREF(list);
				2865	Py_DECREF(string);
				2866	return NULL;
				2867	}
				2868
				2869	static
				2870	PyObject split_char(PyUnicodeObject self,
				2871	PyObject *list,
				2872	Py_UNICODE ch,
				2873	int maxcount)
				2874	{
				2875	register int i;
				2876	register int j;
				2877	int len = self->length;
				2878	PyObject *str;
				2879
				2880	for (i = j = 0; i < len; ) {
				2881	if (self->str[i] == ch) {
				2882	if (maxcount-- <= 0)
				2883	break;
				2884	SPLIT_APPEND(self->str, j, i);
				2885	i = j = i + 1;
				2886	} else
				2887	i++;
				2888	}
				2889	if (j <= len) {
				2890	SPLIT_APPEND(self->str, j, len);
				2891	}
				2892	return list;
				2893
				2894	onError:
				2895	Py_DECREF(list);
				2896	return NULL;
				2897	}
				2898
				2899	static
				2900	PyObject split_substring(PyUnicodeObject self,
				2901	PyObject *list,
				2902	PyUnicodeObject *substring,
				2903	int maxcount)
				2904	{
				2905	register int i;
				2906	register int j;
				2907	int len = self->length;
				2908	int sublen = substring->length;
				2909	PyObject *str;
				2910
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2911	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2912	if (Py_UNICODE_MATCH(self, i, substring)) {
				2913	if (maxcount-- <= 0)
				2914	break;
				2915	SPLIT_APPEND(self->str, j, i);
				2916	i = j = i + sublen;
				2917	} else
				2918	i++;
				2919	}
				2920	if (j <= len) {
				2921	SPLIT_APPEND(self->str, j, len);
				2922	}
				2923	return list;
				2924
				2925	onError:
				2926	Py_DECREF(list);
				2927	return NULL;
				2928	}
				2929
				2930	#undef SPLIT_APPEND
				2931
				2932	static
				2933	PyObject split(PyUnicodeObject self,
				2934	PyUnicodeObject *substring,
				2935	int maxcount)
				2936	{
				2937	PyObject *list;
				2938
				2939	if (maxcount < 0)
				2940	maxcount = INT_MAX;
				2941
				2942	list = PyList_New(0);
				2943	if (!list)
				2944	return NULL;
				2945
				2946	if (substring == NULL)
				2947	return split_whitespace(self,list,maxcount);
				2948
				2949	else if (substring->length == 1)
				2950	return split_char(self,list,substring->str[0],maxcount);
				2951
				2952	else if (substring->length == 0) {
				2953	Py_DECREF(list);
				2954	PyErr_SetString(PyExc_ValueError, "empty separator");
				2955	return NULL;
				2956	}
				2957	else
				2958	return split_substring(self,list,substring,maxcount);
				2959	}
				2960
				2961	static
				2962	PyObject strip(PyUnicodeObject self,
				2963	int left,
				2964	int right)
				2965	{
				2966	Py_UNICODE *p = self->str;
				2967	int start = 0;
				2968	int end = self->length;
				2969
				2970	if (left)
				2971	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2972	start++;
				2973
				2974	if (right)
				2975	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2976	end--;
				2977
				2978	if (start == 0 && end == self->length) {
				2979	/* couldn't strip anything off, return original string */
				2980	Py_INCREF(self);
				2981	return (PyObject*) self;
				2982	}
				2983
				2984	return (PyObject*) PyUnicode_FromUnicode(
				2985	self->str + start,
				2986	end - start
				2987	);
				2988	}
				2989
				2990	static
				2991	PyObject replace(PyUnicodeObject self,
				2992	PyUnicodeObject *str1,
				2993	PyUnicodeObject *str2,
				2994	int maxcount)
				2995	{
				2996	PyUnicodeObject *u;
				2997
				2998	if (maxcount < 0)
				2999	maxcount = INT_MAX;
				3000
				3001	if (str1->length == 1 && str2->length == 1) {
				3002	int i;
				3003
				3004	/* replace characters */
				3005	if (!findchar(self->str, self->length, str1->str[0])) {
				3006	/* nothing to replace, return original string */
				3007	Py_INCREF(self);
				3008	u = self;
				3009	} else {
				3010	Py_UNICODE u1 = str1->str[0];
				3011	Py_UNICODE u2 = str2->str[0];
				3012
				3013	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3014	self->str,
				3015	self->length
				3016	);
				3017	if (u)
				3018	for (i = 0; i < u->length; i++)
				3019	if (u->str[i] == u1) {
				3020	if (--maxcount < 0)
				3021	break;
				3022	u->str[i] = u2;
				3023	}
				3024	}
				3025
				3026	} else {
				3027	int n, i;
				3028	Py_UNICODE *p;
				3029
				3030	/* replace strings */
				3031	n = count(self, 0, self->length, str1);
				3032	if (n > maxcount)
				3033	n = maxcount;
				3034	if (n == 0) {
				3035	/* nothing to replace, return original string */
				3036	Py_INCREF(self);
				3037	u = self;
				3038	} else {
				3039	u = _PyUnicode_New(
				3040	self->length + n * (str2->length - str1->length));
				3041	if (u) {
				3042	i = 0;
				3043	p = u->str;
				3044	while (i <= self->length - str1->length)
				3045	if (Py_UNICODE_MATCH(self, i, str1)) {
				3046	/* replace string segment */
				3047	Py_UNICODE_COPY(p, str2->str, str2->length);
				3048	p += str2->length;
				3049	i += str1->length;
				3050	if (--n <= 0) {
				3051	/* copy remaining part */
				3052	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3053	break;
				3054	}
				3055	} else
				3056	*p++ = self->str[i++];
				3057	}
				3058	}
				3059	}
				3060
				3061	return (PyObject *) u;
				3062	}
				3063
				3064	/* --- Unicode Object Methods --------------------------------------------- */
				3065
				3066	static char title__doc__[] =
				3067	"S.title() -> unicode\n\
				3068	\n\
				3069	Return a titlecased version of S, i.e. words start with title case\n\
				3070	characters, all remaining cased characters have lower case.";
				3071
				3072	static PyObject*
				3073	unicode_title(PyUnicodeObject self, PyObject args)
				3074	{
				3075	if (!PyArg_NoArgs(args))
				3076	return NULL;
				3077	return fixup(self, fixtitle);
				3078	}
				3079
				3080	static char capitalize__doc__[] =
				3081	"S.capitalize() -> unicode\n\
				3082	\n\
				3083	Return a capitalized version of S, i.e. make the first character\n\
				3084	have upper case.";
				3085
				3086	static PyObject*
				3087	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3088	{
				3089	if (!PyArg_NoArgs(args))
				3090	return NULL;
				3091	return fixup(self, fixcapitalize);
				3092	}
				3093
				3094	#if 0
				3095	static char capwords__doc__[] =
				3096	"S.capwords() -> unicode\n\
				3097	\n\
				3098	Apply .capitalize() to all words in S and return the result with\n\
				3099	normalized whitespace (all whitespace strings are replaced by ' ').";
				3100
				3101	static PyObject*
				3102	unicode_capwords(PyUnicodeObject self, PyObject args)
				3103	{
				3104	PyObject *list;
				3105	PyObject *item;
				3106	int i;
				3107
				3108	if (!PyArg_NoArgs(args))
				3109	return NULL;
				3110
				3111	/* Split into words */
				3112	list = split(self, NULL, -1);
				3113	if (!list)
				3114	return NULL;
				3115
				3116	/* Capitalize each word */
				3117	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3118	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3119	fixcapitalize);
				3120	if (item == NULL)
				3121	goto onError;
				3122	Py_DECREF(PyList_GET_ITEM(list, i));
				3123	PyList_SET_ITEM(list, i, item);
				3124	}
				3125
				3126	/* Join the words to form a new string */
				3127	item = PyUnicode_Join(NULL, list);
				3128
				3129	onError:
				3130	Py_DECREF(list);
				3131	return (PyObject *)item;
				3132	}
				3133	#endif
				3134
				3135	static char center__doc__[] =
				3136	"S.center(width) -> unicode\n\
				3137	\n\
				3138	Return S centered in a Unicode string of length width. Padding is done\n\
				3139	using spaces.";
				3140
				3141	static PyObject *
				3142	unicode_center(PyUnicodeObject self, PyObject args)
				3143	{
				3144	int marg, left;
				3145	int width;
				3146
				3147	if (!PyArg_ParseTuple(args, "i:center", &width))
				3148	return NULL;
				3149
				3150	if (self->length >= width) {
				3151	Py_INCREF(self);
				3152	return (PyObject*) self;
				3153	}
				3154
				3155	marg = width - self->length;
				3156	left = marg / 2 + (marg & width & 1);
				3157
				3158	return (PyObject*) pad(self, left, marg - left, ' ');
				3159	}
				3160
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3161	#if 0
				3162
				3163	/* This code should go into some future Unicode collation support
				3164	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3165	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3166
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3167	/* speedy UTF-16 code point order comparison */
				3168	/* gleaned from: */
				3169	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3170
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3171	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3172	{
				3173	0, 0, 0, 0, 0, 0, 0, 0,
				3174	0, 0, 0, 0, 0, 0, 0, 0,
				3175	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3176	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3177	};
				3178
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3179	static int
				3180	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3181	{
				3182	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3183
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3184	Py_UNICODE *s1 = str1->str;
				3185	Py_UNICODE *s2 = str2->str;
				3186
				3187	len1 = str1->length;
				3188	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3189
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3190	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3191	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3192	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3193
				3194	c1 = *s1++;
				3195	c2 = *s2++;
				3196	if (c1 > (1<<11) * 26)
				3197	c1 += utf16Fixup[c1>>11];
				3198	if (c2 > (1<<11) * 26)
				3199	c2 += utf16Fixup[c2>>11];
				3200
				3201	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3202	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3203	if (diff)
				3204	return (diff < 0) ? -1 : (diff != 0);
				3205	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3206	}
				3207
				3208	return (len1 < len2) ? -1 : (len1 != len2);
				3209	}
				3210
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3211	#else
				3212
				3213	static int
				3214	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3215	{
				3216	register int len1, len2;
				3217
				3218	Py_UNICODE *s1 = str1->str;
				3219	Py_UNICODE *s2 = str2->str;
				3220
				3221	len1 = str1->length;
				3222	len2 = str2->length;
				3223
				3224	while (len1 > 0 && len2 > 0) {
				3225	register long diff;
				3226
				3227	diff = (long)s1++ - (long)s2++;
				3228	if (diff)
				3229	return (diff < 0) ? -1 : (diff != 0);
				3230	len1--; len2--;
				3231	}
				3232
				3233	return (len1 < len2) ? -1 : (len1 != len2);
				3234	}
				3235
				3236	#endif
				3237
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3238	int PyUnicode_Compare(PyObject *left,
				3239	PyObject *right)
				3240	{
				3241	PyUnicodeObject u = NULL, v = NULL;
				3242	int result;
				3243
				3244	/* Coerce the two arguments */
				3245	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3246	if (u == NULL)
				3247	goto onError;
				3248	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3249	if (v == NULL)
				3250	goto onError;
				3251
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3252	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3253	if (v == u) {
				3254	Py_DECREF(u);
				3255	Py_DECREF(v);
				3256	return 0;
				3257	}
				3258
				3259	result = unicode_compare(u, v);
				3260
				3261	Py_DECREF(u);
				3262	Py_DECREF(v);
				3263	return result;
				3264
				3265	onError:
				3266	Py_XDECREF(u);
				3267	Py_XDECREF(v);
				3268	return -1;
				3269	}
				3270
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3271	int PyUnicode_Contains(PyObject *container,
				3272	PyObject *element)
				3273	{
				3274	PyUnicodeObject u = NULL, v = NULL;
				3275	int result;
				3276	register const Py_UNICODE p, e;
				3277	register Py_UNICODE ch;
				3278
				3279	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3280	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3281	if (v == NULL) {
				3282	PyErr_SetString(PyExc_TypeError,
				3283	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3284	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3285	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3286	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3287	if (u == NULL) {
				3288	Py_DECREF(v);
				3289	goto onError;
				3290	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3291
				3292	/* Check v in u */
				3293	if (PyUnicode_GET_SIZE(v) != 1) {
				3294	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3295	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3296	goto onError;
				3297	}
				3298	ch = *PyUnicode_AS_UNICODE(v);
				3299	p = PyUnicode_AS_UNICODE(u);
				3300	e = p + PyUnicode_GET_SIZE(u);
				3301	result = 0;
				3302	while (p < e) {
				3303	if (*p++ == ch) {
				3304	result = 1;
				3305	break;
				3306	}
				3307	}
				3308
				3309	Py_DECREF(u);
				3310	Py_DECREF(v);
				3311	return result;
				3312
				3313	onError:
				3314	Py_XDECREF(u);
				3315	Py_XDECREF(v);
				3316	return -1;
				3317	}
				3318
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3319	/* Concat to string or Unicode object giving a new Unicode object. */
				3320
				3321	PyObject PyUnicode_Concat(PyObject left,
				3322	PyObject *right)
				3323	{
				3324	PyUnicodeObject u = NULL, v = NULL, *w;
				3325
				3326	/* Coerce the two arguments */
				3327	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3328	if (u == NULL)
				3329	goto onError;
				3330	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3331	if (v == NULL)
				3332	goto onError;
				3333
				3334	/* Shortcuts */
				3335	if (v == unicode_empty) {
				3336	Py_DECREF(v);
				3337	return (PyObject *)u;
				3338	}
				3339	if (u == unicode_empty) {
				3340	Py_DECREF(u);
				3341	return (PyObject *)v;
				3342	}
				3343
				3344	/* Concat the two Unicode strings */
				3345	w = _PyUnicode_New(u->length + v->length);
				3346	if (w == NULL)
				3347	goto onError;
				3348	Py_UNICODE_COPY(w->str, u->str, u->length);
				3349	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3350
				3351	Py_DECREF(u);
				3352	Py_DECREF(v);
				3353	return (PyObject *)w;
				3354
				3355	onError:
				3356	Py_XDECREF(u);
				3357	Py_XDECREF(v);
				3358	return NULL;
				3359	}
				3360
				3361	static char count__doc__[] =
				3362	"S.count(sub[, start[, end]]) -> int\n\
				3363	\n\
				3364	Return the number of occurrences of substring sub in Unicode string\n\
				3365	S[start:end]. Optional arguments start and end are\n\
				3366	interpreted as in slice notation.";
				3367
				3368	static PyObject *
				3369	unicode_count(PyUnicodeObject self, PyObject args)
				3370	{
				3371	PyUnicodeObject *substring;
				3372	int start = 0;
				3373	int end = INT_MAX;
				3374	PyObject *result;
				3375
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3376	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3377	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3378	return NULL;
				3379
				3380	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3381	(PyObject *)substring);
				3382	if (substring == NULL)
				3383	return NULL;
				3384
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3385	if (start < 0)
				3386	start += self->length;
				3387	if (start < 0)
				3388	start = 0;
				3389	if (end > self->length)
				3390	end = self->length;
				3391	if (end < 0)
				3392	end += self->length;
				3393	if (end < 0)
				3394	end = 0;
				3395
				3396	result = PyInt_FromLong((long) count(self, start, end, substring));
				3397
				3398	Py_DECREF(substring);
				3399	return result;
				3400	}
				3401
				3402	static char encode__doc__[] =
				3403	"S.encode([encoding[,errors]]) -> string\n\
				3404	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3405	Return an encoded string version of S. Default encoding is the current\n\
				3406	default string encoding. errors may be given to set a different error\n\
				3407	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3408	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3409
				3410	static PyObject *
				3411	unicode_encode(PyUnicodeObject self, PyObject args)
				3412	{
				3413	char *encoding = NULL;
				3414	char *errors = NULL;
				3415	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3416	return NULL;
				3417	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3418	}
				3419
				3420	static char expandtabs__doc__[] =
				3421	"S.expandtabs([tabsize]) -> unicode\n\
				3422	\n\
				3423	Return a copy of S where all tab characters are expanded using spaces.\n\
				3424	If tabsize is not given, a tab size of 8 characters is assumed.";
				3425
				3426	static PyObject*
				3427	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3428	{
				3429	Py_UNICODE *e;
				3430	Py_UNICODE *p;
				3431	Py_UNICODE *q;
				3432	int i, j;
				3433	PyUnicodeObject *u;
				3434	int tabsize = 8;
				3435
				3436	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3437	return NULL;
				3438
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3439	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3440	i = j = 0;
				3441	e = self->str + self->length;
				3442	for (p = self->str; p < e; p++)
				3443	if (*p == '\t') {
				3444	if (tabsize > 0)
				3445	j += tabsize - (j % tabsize);
				3446	}
				3447	else {
				3448	j++;
				3449	if (p == '\n' \|\| p == '\r') {
				3450	i += j;
				3451	j = 0;
				3452	}
				3453	}
				3454
				3455	/* Second pass: create output string and fill it */
				3456	u = _PyUnicode_New(i + j);
				3457	if (!u)
				3458	return NULL;
				3459
				3460	j = 0;
				3461	q = u->str;
				3462
				3463	for (p = self->str; p < e; p++)
				3464	if (*p == '\t') {
				3465	if (tabsize > 0) {
				3466	i = tabsize - (j % tabsize);
				3467	j += i;
				3468	while (i--)
				3469	*q++ = ' ';
				3470	}
				3471	}
				3472	else {
				3473	j++;
				3474	q++ = p;
				3475	if (p == '\n' \|\| p == '\r')
				3476	j = 0;
				3477	}
				3478
				3479	return (PyObject*) u;
				3480	}
				3481
				3482	static char find__doc__[] =
				3483	"S.find(sub [,start [,end]]) -> int\n\
				3484	\n\
				3485	Return the lowest index in S where substring sub is found,\n\
				3486	such that sub is contained within s[start,end]. Optional\n\
				3487	arguments start and end are interpreted as in slice notation.\n\
				3488	\n\
				3489	Return -1 on failure.";
				3490
				3491	static PyObject *
				3492	unicode_find(PyUnicodeObject self, PyObject args)
				3493	{
				3494	PyUnicodeObject *substring;
				3495	int start = 0;
				3496	int end = INT_MAX;
				3497	PyObject *result;
				3498
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3499	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3500	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3501	return NULL;
				3502	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3503	(PyObject *)substring);
				3504	if (substring == NULL)
				3505	return NULL;
				3506
				3507	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3508
				3509	Py_DECREF(substring);
				3510	return result;
				3511	}
				3512
				3513	static PyObject *
				3514	unicode_getitem(PyUnicodeObject *self, int index)
				3515	{
				3516	if (index < 0 \|\| index >= self->length) {
				3517	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3518	return NULL;
				3519	}
				3520
				3521	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3522	}
				3523
				3524	static long
				3525	unicode_hash(PyUnicodeObject *self)
				3526	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3527	/* Since Unicode objects compare equal to their ASCII string
				3528	counterparts, they should use the individual character values
				3529	as basis for their hash value. This is needed to assure that
				3530	strings and Unicode objects behave in the same way as
				3531	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3532
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3533	register int len;
				3534	register Py_UNICODE *p;
				3535	register long x;
				3536
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3537	if (self->hash != -1)
				3538	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3539	len = PyUnicode_GET_SIZE(self);
				3540	p = PyUnicode_AS_UNICODE(self);
				3541	x = *p << 7;
				3542	while (--len >= 0)
				3543	x = (1000003x) ^ p++;
				3544	x ^= PyUnicode_GET_SIZE(self);
				3545	if (x == -1)
				3546	x = -2;
				3547	self->hash = x;
				3548	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3549	}
				3550
				3551	static char index__doc__[] =
				3552	"S.index(sub [,start [,end]]) -> int\n\
				3553	\n\
				3554	Like S.find() but raise ValueError when the substring is not found.";
				3555
				3556	static PyObject *
				3557	unicode_index(PyUnicodeObject self, PyObject args)
				3558	{
				3559	int result;
				3560	PyUnicodeObject *substring;
				3561	int start = 0;
				3562	int end = INT_MAX;
				3563
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3564	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3565	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3566	return NULL;
				3567
				3568	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3569	(PyObject *)substring);
				3570	if (substring == NULL)
				3571	return NULL;
				3572
				3573	result = findstring(self, substring, start, end, 1);
				3574
				3575	Py_DECREF(substring);
				3576	if (result < 0) {
				3577	PyErr_SetString(PyExc_ValueError, "substring not found");
				3578	return NULL;
				3579	}
				3580	return PyInt_FromLong(result);
				3581	}
				3582
				3583	static char islower__doc__[] =
				3584	"S.islower() -> int\n\
				3585	\n\
				3586	Return 1 if all cased characters in S are lowercase and there is\n\
				3587	at least one cased character in S, 0 otherwise.";
				3588
				3589	static PyObject*
				3590	unicode_islower(PyUnicodeObject self, PyObject args)
				3591	{
				3592	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3593	register const Py_UNICODE *e;
				3594	int cased;
				3595
				3596	if (!PyArg_NoArgs(args))
				3597	return NULL;
				3598
				3599	/* Shortcut for single character strings */
				3600	if (PyUnicode_GET_SIZE(self) == 1)
				3601	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3602
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3603	/* Special case for empty strings */
				3604	if (PyString_GET_SIZE(self) == 0)
				3605	return PyInt_FromLong(0);
				3606
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3607	e = p + PyUnicode_GET_SIZE(self);
				3608	cased = 0;
				3609	for (; p < e; p++) {
				3610	register const Py_UNICODE ch = *p;
				3611
				3612	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3613	return PyInt_FromLong(0);
				3614	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3615	cased = 1;
				3616	}
				3617	return PyInt_FromLong(cased);
				3618	}
				3619
				3620	static char isupper__doc__[] =
				3621	"S.isupper() -> int\n\
				3622	\n\
				3623	Return 1 if all cased characters in S are uppercase and there is\n\
				3624	at least one cased character in S, 0 otherwise.";
				3625
				3626	static PyObject*
				3627	unicode_isupper(PyUnicodeObject self, PyObject args)
				3628	{
				3629	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3630	register const Py_UNICODE *e;
				3631	int cased;
				3632
				3633	if (!PyArg_NoArgs(args))
				3634	return NULL;
				3635
				3636	/* Shortcut for single character strings */
				3637	if (PyUnicode_GET_SIZE(self) == 1)
				3638	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3639
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3640	/* Special case for empty strings */
				3641	if (PyString_GET_SIZE(self) == 0)
				3642	return PyInt_FromLong(0);
				3643
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3644	e = p + PyUnicode_GET_SIZE(self);
				3645	cased = 0;
				3646	for (; p < e; p++) {
				3647	register const Py_UNICODE ch = *p;
				3648
				3649	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3650	return PyInt_FromLong(0);
				3651	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3652	cased = 1;
				3653	}
				3654	return PyInt_FromLong(cased);
				3655	}
				3656
				3657	static char istitle__doc__[] =
				3658	"S.istitle() -> int\n\
				3659	\n\
				3660	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3661	may only follow uncased characters and lowercase characters only cased\n\
				3662	ones. Return 0 otherwise.";
				3663
				3664	static PyObject*
				3665	unicode_istitle(PyUnicodeObject self, PyObject args)
				3666	{
				3667	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3668	register const Py_UNICODE *e;
				3669	int cased, previous_is_cased;
				3670
				3671	if (!PyArg_NoArgs(args))
				3672	return NULL;
				3673
				3674	/* Shortcut for single character strings */
				3675	if (PyUnicode_GET_SIZE(self) == 1)
				3676	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3677	(Py_UNICODE_ISUPPER(*p) != 0));
				3678
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3679	/* Special case for empty strings */
				3680	if (PyString_GET_SIZE(self) == 0)
				3681	return PyInt_FromLong(0);
				3682
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3683	e = p + PyUnicode_GET_SIZE(self);
				3684	cased = 0;
				3685	previous_is_cased = 0;
				3686	for (; p < e; p++) {
				3687	register const Py_UNICODE ch = *p;
				3688
				3689	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3690	if (previous_is_cased)
				3691	return PyInt_FromLong(0);
				3692	previous_is_cased = 1;
				3693	cased = 1;
				3694	}
				3695	else if (Py_UNICODE_ISLOWER(ch)) {
				3696	if (!previous_is_cased)
				3697	return PyInt_FromLong(0);
				3698	previous_is_cased = 1;
				3699	cased = 1;
				3700	}
				3701	else
				3702	previous_is_cased = 0;
				3703	}
				3704	return PyInt_FromLong(cased);
				3705	}
				3706
				3707	static char isspace__doc__[] =
				3708	"S.isspace() -> int\n\
				3709	\n\
				3710	Return 1 if there are only whitespace characters in S,\n\
				3711	0 otherwise.";
				3712
				3713	static PyObject*
				3714	unicode_isspace(PyUnicodeObject self, PyObject args)
				3715	{
				3716	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3717	register const Py_UNICODE *e;
				3718
				3719	if (!PyArg_NoArgs(args))
				3720	return NULL;
				3721
				3722	/* Shortcut for single character strings */
				3723	if (PyUnicode_GET_SIZE(self) == 1 &&
				3724	Py_UNICODE_ISSPACE(*p))
				3725	return PyInt_FromLong(1);
				3726
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3727	/* Special case for empty strings */
				3728	if (PyString_GET_SIZE(self) == 0)
				3729	return PyInt_FromLong(0);
				3730
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3731	e = p + PyUnicode_GET_SIZE(self);
				3732	for (; p < e; p++) {
				3733	if (!Py_UNICODE_ISSPACE(*p))
				3734	return PyInt_FromLong(0);
				3735	}
				3736	return PyInt_FromLong(1);
				3737	}
				3738
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3739	static char isalpha__doc__[] =
				3740	"S.isalpha() -> int\n\
				3741	\n\
				3742	Return 1 if all characters in S are alphabetic\n\
				3743	and there is at least one character in S, 0 otherwise.";
				3744
				3745	static PyObject*
				3746	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3747	{
				3748	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3749	register const Py_UNICODE *e;
				3750
				3751	if (!PyArg_NoArgs(args))
				3752	return NULL;
				3753
				3754	/* Shortcut for single character strings */
				3755	if (PyUnicode_GET_SIZE(self) == 1 &&
				3756	Py_UNICODE_ISALPHA(*p))
				3757	return PyInt_FromLong(1);
				3758
				3759	/* Special case for empty strings */
				3760	if (PyString_GET_SIZE(self) == 0)
				3761	return PyInt_FromLong(0);
				3762
				3763	e = p + PyUnicode_GET_SIZE(self);
				3764	for (; p < e; p++) {
				3765	if (!Py_UNICODE_ISALPHA(*p))
				3766	return PyInt_FromLong(0);
				3767	}
				3768	return PyInt_FromLong(1);
				3769	}
				3770
				3771	static char isalnum__doc__[] =
				3772	"S.isalnum() -> int\n\
				3773	\n\
				3774	Return 1 if all characters in S are alphanumeric\n\
				3775	and there is at least one character in S, 0 otherwise.";
				3776
				3777	static PyObject*
				3778	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3779	{
				3780	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3781	register const Py_UNICODE *e;
				3782
				3783	if (!PyArg_NoArgs(args))
				3784	return NULL;
				3785
				3786	/* Shortcut for single character strings */
				3787	if (PyUnicode_GET_SIZE(self) == 1 &&
				3788	Py_UNICODE_ISALNUM(*p))
				3789	return PyInt_FromLong(1);
				3790
				3791	/* Special case for empty strings */
				3792	if (PyString_GET_SIZE(self) == 0)
				3793	return PyInt_FromLong(0);
				3794
				3795	e = p + PyUnicode_GET_SIZE(self);
				3796	for (; p < e; p++) {
				3797	if (!Py_UNICODE_ISALNUM(*p))
				3798	return PyInt_FromLong(0);
				3799	}
				3800	return PyInt_FromLong(1);
				3801	}
				3802
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3803	static char isdecimal__doc__[] =
				3804	"S.isdecimal() -> int\n\
				3805	\n\
				3806	Return 1 if there are only decimal characters in S,\n\
				3807	0 otherwise.";
				3808
				3809	static PyObject*
				3810	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3811	{
				3812	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3813	register const Py_UNICODE *e;
				3814
				3815	if (!PyArg_NoArgs(args))
				3816	return NULL;
				3817
				3818	/* Shortcut for single character strings */
				3819	if (PyUnicode_GET_SIZE(self) == 1 &&
				3820	Py_UNICODE_ISDECIMAL(*p))
				3821	return PyInt_FromLong(1);
				3822
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3823	/* Special case for empty strings */
				3824	if (PyString_GET_SIZE(self) == 0)
				3825	return PyInt_FromLong(0);
				3826
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3827	e = p + PyUnicode_GET_SIZE(self);
				3828	for (; p < e; p++) {
				3829	if (!Py_UNICODE_ISDECIMAL(*p))
				3830	return PyInt_FromLong(0);
				3831	}
				3832	return PyInt_FromLong(1);
				3833	}
				3834
				3835	static char isdigit__doc__[] =
				3836	"S.isdigit() -> int\n\
				3837	\n\
				3838	Return 1 if there are only digit characters in S,\n\
				3839	0 otherwise.";
				3840
				3841	static PyObject*
				3842	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3843	{
				3844	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3845	register const Py_UNICODE *e;
				3846
				3847	if (!PyArg_NoArgs(args))
				3848	return NULL;
				3849
				3850	/* Shortcut for single character strings */
				3851	if (PyUnicode_GET_SIZE(self) == 1 &&
				3852	Py_UNICODE_ISDIGIT(*p))
				3853	return PyInt_FromLong(1);
				3854
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3855	/* Special case for empty strings */
				3856	if (PyString_GET_SIZE(self) == 0)
				3857	return PyInt_FromLong(0);
				3858
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3859	e = p + PyUnicode_GET_SIZE(self);
				3860	for (; p < e; p++) {
				3861	if (!Py_UNICODE_ISDIGIT(*p))
				3862	return PyInt_FromLong(0);
				3863	}
				3864	return PyInt_FromLong(1);
				3865	}
				3866
				3867	static char isnumeric__doc__[] =
				3868	"S.isnumeric() -> int\n\
				3869	\n\
				3870	Return 1 if there are only numeric characters in S,\n\
				3871	0 otherwise.";
				3872
				3873	static PyObject*
				3874	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3875	{
				3876	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3877	register const Py_UNICODE *e;
				3878
				3879	if (!PyArg_NoArgs(args))
				3880	return NULL;
				3881
				3882	/* Shortcut for single character strings */
				3883	if (PyUnicode_GET_SIZE(self) == 1 &&
				3884	Py_UNICODE_ISNUMERIC(*p))
				3885	return PyInt_FromLong(1);
				3886
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3887	/* Special case for empty strings */
				3888	if (PyString_GET_SIZE(self) == 0)
				3889	return PyInt_FromLong(0);
				3890
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3891	e = p + PyUnicode_GET_SIZE(self);
				3892	for (; p < e; p++) {
				3893	if (!Py_UNICODE_ISNUMERIC(*p))
				3894	return PyInt_FromLong(0);
				3895	}
				3896	return PyInt_FromLong(1);
				3897	}
				3898
				3899	static char join__doc__[] =
				3900	"S.join(sequence) -> unicode\n\
				3901	\n\
				3902	Return a string which is the concatenation of the strings in the\n\
				3903	sequence. The separator between elements is S.";
				3904
				3905	static PyObject*
				3906	unicode_join(PyUnicodeObject self, PyObject args)
				3907	{
				3908	PyObject *data;
				3909	if (!PyArg_ParseTuple(args, "O:join", &data))
				3910	return NULL;
				3911
				3912	return PyUnicode_Join((PyObject *)self, data);
				3913	}
				3914
				3915	static int
				3916	unicode_length(PyUnicodeObject *self)
				3917	{
				3918	return self->length;
				3919	}
				3920
				3921	static char ljust__doc__[] =
				3922	"S.ljust(width) -> unicode\n\
				3923	\n\
				3924	Return S left justified in a Unicode string of length width. Padding is\n\
				3925	done using spaces.";
				3926
				3927	static PyObject *
				3928	unicode_ljust(PyUnicodeObject self, PyObject args)
				3929	{
				3930	int width;
				3931	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3932	return NULL;
				3933
				3934	if (self->length >= width) {
				3935	Py_INCREF(self);
				3936	return (PyObject*) self;
				3937	}
				3938
				3939	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3940	}
				3941
				3942	static char lower__doc__[] =
				3943	"S.lower() -> unicode\n\
				3944	\n\
				3945	Return a copy of the string S converted to lowercase.";
				3946
				3947	static PyObject*
				3948	unicode_lower(PyUnicodeObject self, PyObject args)
				3949	{
				3950	if (!PyArg_NoArgs(args))
				3951	return NULL;
				3952	return fixup(self, fixlower);
				3953	}
				3954
				3955	static char lstrip__doc__[] =
				3956	"S.lstrip() -> unicode\n\
				3957	\n\
				3958	Return a copy of the string S with leading whitespace removed.";
				3959
				3960	static PyObject *
				3961	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3962	{
				3963	if (!PyArg_NoArgs(args))
				3964	return NULL;
				3965	return strip(self, 1, 0);
				3966	}
				3967
				3968	static PyObject*
				3969	unicode_repeat(PyUnicodeObject *str, int len)
				3970	{
				3971	PyUnicodeObject *u;
				3972	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3973	int nchars;
				3974	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3975
				3976	if (len < 0)
				3977	len = 0;
				3978
				3979	if (len == 1) {
				3980	/* no repeat, return original string */
				3981	Py_INCREF(str);
				3982	return (PyObject*) str;
				3983	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3984
				3985	/* ensure # of chars needed doesn't overflow int and # of bytes
				3986	* needed doesn't overflow size_t
				3987	*/
				3988	nchars = len * str->length;
				3989	if (len && nchars / len != str->length) {
				3990	PyErr_SetString(PyExc_OverflowError,
				3991	"repeated string is too long");
				3992	return NULL;
				3993	}
				3994	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				3995	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				3996	PyErr_SetString(PyExc_OverflowError,
				3997	"repeated string is too long");
				3998	return NULL;
				3999	}
				4000	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4001	if (!u)
				4002	return NULL;
				4003
				4004	p = u->str;
				4005
				4006	while (len-- > 0) {
				4007	Py_UNICODE_COPY(p, str->str, str->length);
				4008	p += str->length;
				4009	}
				4010
				4011	return (PyObject*) u;
				4012	}
				4013
				4014	PyObject PyUnicode_Replace(PyObject obj,
				4015	PyObject *subobj,
				4016	PyObject *replobj,
				4017	int maxcount)
				4018	{
				4019	PyObject *self;
				4020	PyObject *str1;
				4021	PyObject *str2;
				4022	PyObject *result;
				4023
				4024	self = PyUnicode_FromObject(obj);
				4025	if (self == NULL)
				4026	return NULL;
				4027	str1 = PyUnicode_FromObject(subobj);
				4028	if (str1 == NULL) {
				4029	Py_DECREF(self);
				4030	return NULL;
				4031	}
				4032	str2 = PyUnicode_FromObject(replobj);
				4033	if (str2 == NULL) {
				4034	Py_DECREF(self);
				4035	Py_DECREF(str1);
				4036	return NULL;
				4037	}
				4038	result = replace((PyUnicodeObject *)self,
				4039	(PyUnicodeObject *)str1,
				4040	(PyUnicodeObject *)str2,
				4041	maxcount);
				4042	Py_DECREF(self);
				4043	Py_DECREF(str1);
				4044	Py_DECREF(str2);
				4045	return result;
				4046	}
				4047
				4048	static char replace__doc__[] =
				4049	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4050	\n\
				4051	Return a copy of S with all occurrences of substring\n\
				4052	old replaced by new. If the optional argument maxsplit is\n\
				4053	given, only the first maxsplit occurrences are replaced.";
				4054
				4055	static PyObject*
				4056	unicode_replace(PyUnicodeObject self, PyObject args)
				4057	{
				4058	PyUnicodeObject *str1;
				4059	PyUnicodeObject *str2;
				4060	int maxcount = -1;
				4061	PyObject *result;
				4062
				4063	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4064	return NULL;
				4065	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4066	if (str1 == NULL)
				4067	return NULL;
				4068	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4069	if (str2 == NULL)
				4070	return NULL;
				4071
				4072	result = replace(self, str1, str2, maxcount);
				4073
				4074	Py_DECREF(str1);
				4075	Py_DECREF(str2);
				4076	return result;
				4077	}
				4078
				4079	static
				4080	PyObject unicode_repr(PyObject unicode)
				4081	{
				4082	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4083	PyUnicode_GET_SIZE(unicode),
				4084	1);
				4085	}
				4086
				4087	static char rfind__doc__[] =
				4088	"S.rfind(sub [,start [,end]]) -> int\n\
				4089	\n\
				4090	Return the highest index in S where substring sub is found,\n\
				4091	such that sub is contained within s[start,end]. Optional\n\
				4092	arguments start and end are interpreted as in slice notation.\n\
				4093	\n\
				4094	Return -1 on failure.";
				4095
				4096	static PyObject *
				4097	unicode_rfind(PyUnicodeObject self, PyObject args)
				4098	{
				4099	PyUnicodeObject *substring;
				4100	int start = 0;
				4101	int end = INT_MAX;
				4102	PyObject *result;
				4103
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4104	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4105	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4106	return NULL;
				4107	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4108	(PyObject *)substring);
				4109	if (substring == NULL)
				4110	return NULL;
				4111
				4112	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4113
				4114	Py_DECREF(substring);
				4115	return result;
				4116	}
				4117
				4118	static char rindex__doc__[] =
				4119	"S.rindex(sub [,start [,end]]) -> int\n\
				4120	\n\
				4121	Like S.rfind() but raise ValueError when the substring is not found.";
				4122
				4123	static PyObject *
				4124	unicode_rindex(PyUnicodeObject self, PyObject args)
				4125	{
				4126	int result;
				4127	PyUnicodeObject *substring;
				4128	int start = 0;
				4129	int end = INT_MAX;
				4130
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4131	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4132	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4133	return NULL;
				4134	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4135	(PyObject *)substring);
				4136	if (substring == NULL)
				4137	return NULL;
				4138
				4139	result = findstring(self, substring, start, end, -1);
				4140
				4141	Py_DECREF(substring);
				4142	if (result < 0) {
				4143	PyErr_SetString(PyExc_ValueError, "substring not found");
				4144	return NULL;
				4145	}
				4146	return PyInt_FromLong(result);
				4147	}
				4148
				4149	static char rjust__doc__[] =
				4150	"S.rjust(width) -> unicode\n\
				4151	\n\
				4152	Return S right justified in a Unicode string of length width. Padding is\n\
				4153	done using spaces.";
				4154
				4155	static PyObject *
				4156	unicode_rjust(PyUnicodeObject self, PyObject args)
				4157	{
				4158	int width;
				4159	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4160	return NULL;
				4161
				4162	if (self->length >= width) {
				4163	Py_INCREF(self);
				4164	return (PyObject*) self;
				4165	}
				4166
				4167	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4168	}
				4169
				4170	static char rstrip__doc__[] =
				4171	"S.rstrip() -> unicode\n\
				4172	\n\
				4173	Return a copy of the string S with trailing whitespace removed.";
				4174
				4175	static PyObject *
				4176	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4177	{
				4178	if (!PyArg_NoArgs(args))
				4179	return NULL;
				4180	return strip(self, 0, 1);
				4181	}
				4182
				4183	static PyObject*
				4184	unicode_slice(PyUnicodeObject *self, int start, int end)
				4185	{
				4186	/* standard clamping */
				4187	if (start < 0)
				4188	start = 0;
				4189	if (end < 0)
				4190	end = 0;
				4191	if (end > self->length)
				4192	end = self->length;
				4193	if (start == 0 && end == self->length) {
				4194	/* full slice, return original string */
				4195	Py_INCREF(self);
				4196	return (PyObject*) self;
				4197	}
				4198	if (start > end)
				4199	start = end;
				4200	/* copy slice */
				4201	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4202	end - start);
				4203	}
				4204
				4205	PyObject PyUnicode_Split(PyObject s,
				4206	PyObject *sep,
				4207	int maxsplit)
				4208	{
				4209	PyObject *result;
				4210
				4211	s = PyUnicode_FromObject(s);
				4212	if (s == NULL)
				4213	return NULL;
				4214	if (sep != NULL) {
				4215	sep = PyUnicode_FromObject(sep);
				4216	if (sep == NULL) {
				4217	Py_DECREF(s);
				4218	return NULL;
				4219	}
				4220	}
				4221
				4222	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4223
				4224	Py_DECREF(s);
				4225	Py_XDECREF(sep);
				4226	return result;
				4227	}
				4228
				4229	static char split__doc__[] =
				4230	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4231	\n\
				4232	Return a list of the words in S, using sep as the\n\
				4233	delimiter string. If maxsplit is given, at most maxsplit\n\
				4234	splits are done. If sep is not specified, any whitespace string\n\
				4235	is a separator.";
				4236
				4237	static PyObject*
				4238	unicode_split(PyUnicodeObject self, PyObject args)
				4239	{
				4240	PyObject *substring = Py_None;
				4241	int maxcount = -1;
				4242
				4243	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4244	return NULL;
				4245
				4246	if (substring == Py_None)
				4247	return split(self, NULL, maxcount);
				4248	else if (PyUnicode_Check(substring))
				4249	return split(self, (PyUnicodeObject *)substring, maxcount);
				4250	else
				4251	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4252	}
				4253
				4254	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4255	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4256	\n\
				4257	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4258	Line breaks are not included in the resulting list unless keepends\n\
				4259	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4260
				4261	static PyObject*
				4262	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4263	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4264	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4265
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4266	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4267	return NULL;
				4268
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4269	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4270	}
				4271
				4272	static
				4273	PyObject unicode_str(PyUnicodeObject self)
				4274	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4275	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4276	}
				4277
				4278	static char strip__doc__[] =
				4279	"S.strip() -> unicode\n\
				4280	\n\
				4281	Return a copy of S with leading and trailing whitespace removed.";
				4282
				4283	static PyObject *
				4284	unicode_strip(PyUnicodeObject self, PyObject args)
				4285	{
				4286	if (!PyArg_NoArgs(args))
				4287	return NULL;
				4288	return strip(self, 1, 1);
				4289	}
				4290
				4291	static char swapcase__doc__[] =
				4292	"S.swapcase() -> unicode\n\
				4293	\n\
				4294	Return a copy of S with uppercase characters converted to lowercase\n\
				4295	and vice versa.";
				4296
				4297	static PyObject*
				4298	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4299	{
				4300	if (!PyArg_NoArgs(args))
				4301	return NULL;
				4302	return fixup(self, fixswapcase);
				4303	}
				4304
				4305	static char translate__doc__[] =
				4306	"S.translate(table) -> unicode\n\
				4307	\n\
				4308	Return a copy of the string S, where all characters have been mapped\n\
				4309	through the given translation table, which must be a mapping of\n\
				4310	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4311	are left untouched. Characters mapped to None are deleted.";
				4312
				4313	static PyObject*
				4314	unicode_translate(PyUnicodeObject self, PyObject args)
				4315	{
				4316	PyObject *table;
				4317
				4318	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4319	return NULL;
				4320	return PyUnicode_TranslateCharmap(self->str,
				4321	self->length,
				4322	table,
				4323	"ignore");
				4324	}
				4325
				4326	static char upper__doc__[] =
				4327	"S.upper() -> unicode\n\
				4328	\n\
				4329	Return a copy of S converted to uppercase.";
				4330
				4331	static PyObject*
				4332	unicode_upper(PyUnicodeObject self, PyObject args)
				4333	{
				4334	if (!PyArg_NoArgs(args))
				4335	return NULL;
				4336	return fixup(self, fixupper);
				4337	}
				4338
				4339	#if 0
				4340	static char zfill__doc__[] =
				4341	"S.zfill(width) -> unicode\n\
				4342	\n\
				4343	Pad a numeric string x with zeros on the left, to fill a field\n\
				4344	of the specified width. The string x is never truncated.";
				4345
				4346	static PyObject *
				4347	unicode_zfill(PyUnicodeObject self, PyObject args)
				4348	{
				4349	int fill;
				4350	PyUnicodeObject *u;
				4351
				4352	int width;
				4353	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4354	return NULL;
				4355
				4356	if (self->length >= width) {
				4357	Py_INCREF(self);
				4358	return (PyObject*) self;
				4359	}
				4360
				4361	fill = width - self->length;
				4362
				4363	u = pad(self, fill, 0, '0');
				4364
				4365	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4366	/* move sign to beginning of string */
				4367	u->str[0] = u->str[fill];
				4368	u->str[fill] = '0';
				4369	}
				4370
				4371	return (PyObject*) u;
				4372	}
				4373	#endif
				4374
				4375	#if 0
				4376	static PyObject*
				4377	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4378	{
				4379	if (!PyArg_NoArgs(args))
				4380	return NULL;
				4381	return PyInt_FromLong(unicode_freelist_size);
				4382	}
				4383	#endif
				4384
				4385	static char startswith__doc__[] =
				4386	"S.startswith(prefix[, start[, end]]) -> int\n\
				4387	\n\
				4388	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4389	optional start, test S beginning at that position. With optional end, stop\n\
				4390	comparing S at that position.";
				4391
				4392	static PyObject *
				4393	unicode_startswith(PyUnicodeObject *self,
				4394	PyObject *args)
				4395	{
				4396	PyUnicodeObject *substring;
				4397	int start = 0;
				4398	int end = INT_MAX;
				4399	PyObject *result;
				4400
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4401	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4402	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4403	return NULL;
				4404	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4405	(PyObject *)substring);
				4406	if (substring == NULL)
				4407	return NULL;
				4408
				4409	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4410
				4411	Py_DECREF(substring);
				4412	return result;
				4413	}
				4414
				4415
				4416	static char endswith__doc__[] =
				4417	"S.endswith(suffix[, start[, end]]) -> int\n\
				4418	\n\
				4419	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4420	optional start, test S beginning at that position. With optional end, stop\n\
				4421	comparing S at that position.";
				4422
				4423	static PyObject *
				4424	unicode_endswith(PyUnicodeObject *self,
				4425	PyObject *args)
				4426	{
				4427	PyUnicodeObject *substring;
				4428	int start = 0;
				4429	int end = INT_MAX;
				4430	PyObject *result;
				4431
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4432	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4433	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4434	return NULL;
				4435	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4436	(PyObject *)substring);
				4437	if (substring == NULL)
				4438	return NULL;
				4439
				4440	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4441
				4442	Py_DECREF(substring);
				4443	return result;
				4444	}
				4445
				4446
				4447	static PyMethodDef unicode_methods[] = {
				4448
				4449	/* Order is according to common usage: often used methods should
				4450	appear first, since lookup is done sequentially. */
				4451
				4452	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4453	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4454	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4455	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4456	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4457	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4458	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4459	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4460	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4461	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4462	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4463	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4464	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4465	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4466	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4467	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4468	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4469	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4470	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4471	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4472	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4473	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4474	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4475	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4476	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4477	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4478	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4479	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4480	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4481	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4482	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4483	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4484	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4485	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4486	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4487	#if 0
				4488	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4489	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4490	#endif
				4491
				4492	#if 0
				4493	/* This one is just used for debugging the implementation. */
				4494	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4495	#endif
				4496
				4497	{NULL, NULL}
				4498	};
				4499
				4500	static PyObject *
				4501	unicode_getattr(PyUnicodeObject self, char name)
				4502	{
				4503	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4504	}
				4505
				4506	static PySequenceMethods unicode_as_sequence = {
				4507	(inquiry) unicode_length, /* sq_length */
				4508	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4509	(intargfunc) unicode_repeat, /* sq_repeat */
				4510	(intargfunc) unicode_getitem, /* sq_item */
				4511	(intintargfunc) unicode_slice, /* sq_slice */
				4512	0, /* sq_ass_item */
				4513	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4514	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4515	};
				4516
				4517	static int
				4518	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4519	int index,
				4520	const void **ptr)
				4521	{
				4522	if (index != 0) {
				4523	PyErr_SetString(PyExc_SystemError,
				4524	"accessing non-existent unicode segment");
				4525	return -1;
				4526	}
				4527	ptr = (void ) self->str;
				4528	return PyUnicode_GET_DATA_SIZE(self);
				4529	}
				4530
				4531	static int
				4532	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4533	const void **ptr)
				4534	{
				4535	PyErr_SetString(PyExc_TypeError,
				4536	"cannot use unicode as modifyable buffer");
				4537	return -1;
				4538	}
				4539
				4540	static int
				4541	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4542	int *lenp)
				4543	{
				4544	if (lenp)
				4545	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4546	return 1;
				4547	}
				4548
				4549	static int
				4550	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4551	int index,
				4552	const void **ptr)
				4553	{
				4554	PyObject *str;
				4555
				4556	if (index != 0) {
				4557	PyErr_SetString(PyExc_SystemError,
				4558	"accessing non-existent unicode segment");
				4559	return -1;
				4560	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4561	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4562	if (str == NULL)
				4563	return -1;
				4564	ptr = (void ) PyString_AS_STRING(str);
				4565	return PyString_GET_SIZE(str);
				4566	}
				4567
				4568	/* Helpers for PyUnicode_Format() */
				4569
				4570	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4571	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4572	{
				4573	int argidx = *p_argidx;
				4574	if (argidx < arglen) {
				4575	(*p_argidx)++;
				4576	if (arglen < 0)
				4577	return args;
				4578	else
				4579	return PyTuple_GetItem(args, argidx);
				4580	}
				4581	PyErr_SetString(PyExc_TypeError,
				4582	"not enough arguments for format string");
				4583	return NULL;
				4584	}
				4585
				4586	#define F_LJUST (1<<0)
				4587	#define F_SIGN (1<<1)
				4588	#define F_BLANK (1<<2)
				4589	#define F_ALT (1<<3)
				4590	#define F_ZERO (1<<4)
				4591
				4592	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4593	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4594	{
				4595	register int i;
				4596	int len;
				4597	va_list va;
				4598	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4599	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4600
				4601	/* First, format the string as char array, then expand to Py_UNICODE
				4602	array. */
				4603	charbuffer = (char *)buffer;
				4604	len = vsprintf(charbuffer, format, va);
				4605	for (i = len - 1; i >= 0; i--)
				4606	buffer[i] = (Py_UNICODE) charbuffer[i];
				4607
				4608	va_end(va);
				4609	return len;
				4610	}
				4611
				4612	static int
				4613	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4614	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4615	int flags,
				4616	int prec,
				4617	int type,
				4618	PyObject *v)
				4619	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4620	/* fmt = '%#.' + `prec` + `type`
				4621	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4622	char fmt[20];
				4623	double x;
				4624
				4625	x = PyFloat_AsDouble(v);
				4626	if (x == -1.0 && PyErr_Occurred())
				4627	return -1;
				4628	if (prec < 0)
				4629	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4630	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4631	type = 'g';
				4632	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4633	/* worst case length calc to ensure no buffer overrun:
				4634	fmt = %#.<prec>g
				4635	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4636	for any double rep.)
				4637	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4638	If prec=0 the effective precision is 1 (the leading digit is
				4639	always given), therefore increase by one to 10+prec. */
				4640	if (buflen <= (size_t)10 + (size_t)prec) {
				4641	PyErr_SetString(PyExc_OverflowError,
				4642	"formatted float is too long (precision too long?)");
				4643	return -1;
				4644	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4645	return usprintf(buf, fmt, x);
				4646	}
				4647
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4648	static PyObject*
				4649	formatlong(PyObject *val, int flags, int prec, int type)
				4650	{
				4651	char *buf;
				4652	int i, len;
				4653	PyObject str; / temporary string object. */
				4654	PyUnicodeObject *result;
				4655
				4656	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4657	if (!str)
				4658	return NULL;
				4659	result = _PyUnicode_New(len);
				4660	for (i = 0; i < len; i++)
				4661	result->str[i] = buf[i];
				4662	result->str[len] = 0;
				4663	Py_DECREF(str);
				4664	return (PyObject*)result;
				4665	}
				4666
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4667	static int
				4668	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4669	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4670	int flags,
				4671	int prec,
				4672	int type,
				4673	PyObject *v)
				4674	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4675	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4676	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4677	+ 1 + 1 = 24*/
				4678	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4679	long x;
				4680
				4681	x = PyInt_AsLong(v);
				4682	if (x == -1 && PyErr_Occurred())
				4683	return -1;
				4684	if (prec < 0)
				4685	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4686	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4687	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4688	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4689	PyErr_SetString(PyExc_OverflowError,
				4690	"formatted integer is too long (precision too long?)");
				4691	return -1;
				4692	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4693	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4694	return usprintf(buf, fmt, x);
				4695	}
				4696
				4697	static int
				4698	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4699	size_t buflen,
				4700	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4701	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4702	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4703	if (PyUnicode_Check(v)) {
				4704	if (PyUnicode_GET_SIZE(v) != 1)
				4705	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4706	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4707	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4708
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4709	else if (PyString_Check(v)) {
				4710	if (PyString_GET_SIZE(v) != 1)
				4711	goto onError;
				4712	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4713	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4714
				4715	else {
				4716	/* Integer input truncated to a character */
				4717	long x;
				4718	x = PyInt_AsLong(v);
				4719	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4720	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4721	buf[0] = (char) x;
				4722	}
				4723	buf[1] = '\0';
				4724	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4725
				4726	onError:
				4727	PyErr_SetString(PyExc_TypeError,
				4728	"%c requires int or char");
				4729	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4730	}
				4731
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. after `sizeof`).
*/
#define FORMATBUFLEN ((size_t)120)
				4741
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4742	PyObject PyUnicode_Format(PyObject format,
				4743	PyObject *args)
				4744	{
				4745	Py_UNICODE fmt, res;
				4746	int fmtcnt, rescnt, reslen, arglen, argidx;
				4747	int args_owned = 0;
				4748	PyUnicodeObject *result = NULL;
				4749	PyObject *dict = NULL;
				4750	PyObject *uformat;
				4751
				4752	if (format == NULL \|\| args == NULL) {
				4753	PyErr_BadInternalCall();
				4754	return NULL;
				4755	}
				4756	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4757	if (uformat == NULL)
				4758	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4759	fmt = PyUnicode_AS_UNICODE(uformat);
				4760	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4761
				4762	reslen = rescnt = fmtcnt + 100;
				4763	result = _PyUnicode_New(reslen);
				4764	if (result == NULL)
				4765	goto onError;
				4766	res = PyUnicode_AS_UNICODE(result);
				4767
				4768	if (PyTuple_Check(args)) {
				4769	arglen = PyTuple_Size(args);
				4770	argidx = 0;
				4771	}
				4772	else {
				4773	arglen = -1;
				4774	argidx = -2;
				4775	}
				4776	if (args->ob_type->tp_as_mapping)
				4777	dict = args;
				4778
				4779	while (--fmtcnt >= 0) {
				4780	if (*fmt != '%') {
				4781	if (--rescnt < 0) {
				4782	rescnt = fmtcnt + 100;
				4783	reslen += rescnt;
				4784	if (_PyUnicode_Resize(result, reslen) < 0)
				4785	return NULL;
				4786	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4787	--rescnt;
				4788	}
				4789	res++ = fmt++;
				4790	}
				4791	else {
				4792	/* Got a format specifier */
				4793	int flags = 0;
				4794	int width = -1;
				4795	int prec = -1;
				4796	int size = 0;
				4797	Py_UNICODE c = '\0';
				4798	Py_UNICODE fill;
				4799	PyObject *v = NULL;
				4800	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4801	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4802	Py_UNICODE sign;
				4803	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4804	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4805
				4806	fmt++;
				4807	if (*fmt == '(') {
				4808	Py_UNICODE *keystart;
				4809	int keylen;
				4810	PyObject *key;
				4811	int pcount = 1;
				4812
				4813	if (dict == NULL) {
				4814	PyErr_SetString(PyExc_TypeError,
				4815	"format requires a mapping");
				4816	goto onError;
				4817	}
				4818	++fmt;
				4819	--fmtcnt;
				4820	keystart = fmt;
				4821	/* Skip over balanced parentheses */
				4822	while (pcount > 0 && --fmtcnt >= 0) {
				4823	if (*fmt == ')')
				4824	--pcount;
				4825	else if (*fmt == '(')
				4826	++pcount;
				4827	fmt++;
				4828	}
				4829	keylen = fmt - keystart - 1;
				4830	if (fmtcnt < 0 \|\| pcount > 0) {
				4831	PyErr_SetString(PyExc_ValueError,
				4832	"incomplete format key");
				4833	goto onError;
				4834	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4835	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4836	then looked up since Python uses strings to hold
				4837	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4838	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4839	key = PyUnicode_EncodeUTF8(keystart,
				4840	keylen,
				4841	NULL);
				4842	if (key == NULL)
				4843	goto onError;
				4844	if (args_owned) {
				4845	Py_DECREF(args);
				4846	args_owned = 0;
				4847	}
				4848	args = PyObject_GetItem(dict, key);
				4849	Py_DECREF(key);
				4850	if (args == NULL) {
				4851	goto onError;
				4852	}
				4853	args_owned = 1;
				4854	arglen = -1;
				4855	argidx = -2;
				4856	}
				4857	while (--fmtcnt >= 0) {
				4858	switch (c = *fmt++) {
				4859	case '-': flags \|= F_LJUST; continue;
				4860	case '+': flags \|= F_SIGN; continue;
				4861	case ' ': flags \|= F_BLANK; continue;
				4862	case '#': flags \|= F_ALT; continue;
				4863	case '0': flags \|= F_ZERO; continue;
				4864	}
				4865	break;
				4866	}
				4867	if (c == '*') {
				4868	v = getnextarg(args, arglen, &argidx);
				4869	if (v == NULL)
				4870	goto onError;
				4871	if (!PyInt_Check(v)) {
				4872	PyErr_SetString(PyExc_TypeError,
				4873	"* wants int");
				4874	goto onError;
				4875	}
				4876	width = PyInt_AsLong(v);
				4877	if (width < 0) {
				4878	flags \|= F_LJUST;
				4879	width = -width;
				4880	}
				4881	if (--fmtcnt >= 0)
				4882	c = *fmt++;
				4883	}
				4884	else if (c >= '0' && c <= '9') {
				4885	width = c - '0';
				4886	while (--fmtcnt >= 0) {
				4887	c = *fmt++;
				4888	if (c < '0' \|\| c > '9')
				4889	break;
				4890	if ((width*10) / 10 != width) {
				4891	PyErr_SetString(PyExc_ValueError,
				4892	"width too big");
				4893	goto onError;
				4894	}
				4895	width = width*10 + (c - '0');
				4896	}
				4897	}
				4898	if (c == '.') {
				4899	prec = 0;
				4900	if (--fmtcnt >= 0)
				4901	c = *fmt++;
				4902	if (c == '*') {
				4903	v = getnextarg(args, arglen, &argidx);
				4904	if (v == NULL)
				4905	goto onError;
				4906	if (!PyInt_Check(v)) {
				4907	PyErr_SetString(PyExc_TypeError,
				4908	"* wants int");
				4909	goto onError;
				4910	}
				4911	prec = PyInt_AsLong(v);
				4912	if (prec < 0)
				4913	prec = 0;
				4914	if (--fmtcnt >= 0)
				4915	c = *fmt++;
				4916	}
				4917	else if (c >= '0' && c <= '9') {
				4918	prec = c - '0';
				4919	while (--fmtcnt >= 0) {
				4920	c = Py_CHARMASK(*fmt++);
				4921	if (c < '0' \|\| c > '9')
				4922	break;
				4923	if ((prec*10) / 10 != prec) {
				4924	PyErr_SetString(PyExc_ValueError,
				4925	"prec too big");
				4926	goto onError;
				4927	}
				4928	prec = prec*10 + (c - '0');
				4929	}
				4930	}
				4931	} /* prec */
				4932	if (fmtcnt >= 0) {
				4933	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4934	size = c;
				4935	if (--fmtcnt >= 0)
				4936	c = *fmt++;
				4937	}
				4938	}
				4939	if (fmtcnt < 0) {
				4940	PyErr_SetString(PyExc_ValueError,
				4941	"incomplete format");
				4942	goto onError;
				4943	}
				4944	if (c != '%') {
				4945	v = getnextarg(args, arglen, &argidx);
				4946	if (v == NULL)
				4947	goto onError;
				4948	}
				4949	sign = 0;
				4950	fill = ' ';
				4951	switch (c) {
				4952
				4953	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4954	pbuf = formatbuf;
				4955	/* presume that buffer length is at least 1 */
				4956	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4957	len = 1;
				4958	break;
				4959
				4960	case 's':
				4961	case 'r':
				4962	if (PyUnicode_Check(v) && c == 's') {
				4963	temp = v;
				4964	Py_INCREF(temp);
				4965	}
				4966	else {
				4967	PyObject *unicode;
				4968	if (c == 's')
				4969	temp = PyObject_Str(v);
				4970	else
				4971	temp = PyObject_Repr(v);
				4972	if (temp == NULL)
				4973	goto onError;
				4974	if (!PyString_Check(temp)) {
				4975	/* XXX Note: this should never happen, since
				4976	PyObject_Repr() and PyObject_Str() assure
				4977	this */
				4978	Py_DECREF(temp);
				4979	PyErr_SetString(PyExc_TypeError,
				4980	"%s argument has non-string str()");
				4981	goto onError;
				4982	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4983	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4984	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4985	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4986	"strict");
				4987	Py_DECREF(temp);
				4988	temp = unicode;
				4989	if (temp == NULL)
				4990	goto onError;
				4991	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4992	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4993	len = PyUnicode_GET_SIZE(temp);
				4994	if (prec >= 0 && len > prec)
				4995	len = prec;
				4996	break;
				4997
				4998	case 'i':
				4999	case 'd':
				5000	case 'u':
				5001	case 'o':
				5002	case 'x':
				5003	case 'X':
				5004	if (c == 'i')
				5005	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5006	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5007	temp = formatlong(v, flags, prec, c);
				5008	if (!temp)
				5009	goto onError;
				5010	pbuf = PyUnicode_AS_UNICODE(temp);
				5011	len = PyUnicode_GET_SIZE(temp);
				5012	/* unbounded ints can always produce
				5013	a sign character! */
				5014	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5015	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5016	else {
				5017	pbuf = formatbuf;
				5018	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5019	flags, prec, c, v);
				5020	if (len < 0)
				5021	goto onError;
				5022	/* only d conversion is signed */
				5023	sign = c == 'd';
				5024	}
				5025	if (flags & F_ZERO)
				5026	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5027	break;
				5028
				5029	case 'e':
				5030	case 'E':
				5031	case 'f':
				5032	case 'g':
				5033	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5034	pbuf = formatbuf;
				5035	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5036	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5037	if (len < 0)
				5038	goto onError;
				5039	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5040	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5041	fill = '0';
				5042	break;
				5043
				5044	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5045	pbuf = formatbuf;
				5046	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5047	if (len < 0)
				5048	goto onError;
				5049	break;
				5050
				5051	default:
				5052	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5053	"unsupported format character '%c' (0x%x) "
				5054	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5055	(31<=c && c<=126) ? c : '?',
				5056	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5057	goto onError;
				5058	}
				5059	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5060	if (pbuf == '-' \|\| pbuf == '+') {
				5061	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5062	len--;
				5063	}
				5064	else if (flags & F_SIGN)
				5065	sign = '+';
				5066	else if (flags & F_BLANK)
				5067	sign = ' ';
				5068	else
				5069	sign = 0;
				5070	}
				5071	if (width < len)
				5072	width = len;
				5073	if (rescnt < width + (sign != 0)) {
				5074	reslen -= rescnt;
				5075	rescnt = width + fmtcnt + 100;
				5076	reslen += rescnt;
				5077	if (_PyUnicode_Resize(result, reslen) < 0)
				5078	return NULL;
				5079	res = PyUnicode_AS_UNICODE(result)
				5080	+ reslen - rescnt;
				5081	}
				5082	if (sign) {
				5083	if (fill != ' ')
				5084	*res++ = sign;
				5085	rescnt--;
				5086	if (width > len)
				5087	width--;
				5088	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5089	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5090	assert(pbuf[0] == '0');
				5091	assert(pbuf[1] == c);
				5092	if (fill != ' ') {
				5093	res++ = pbuf++;
				5094	res++ = pbuf++;
				5095	}
				5096	rescnt -= 2;
				5097	width -= 2;
				5098	if (width < 0)
				5099	width = 0;
				5100	len -= 2;
				5101	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5102	if (width > len && !(flags & F_LJUST)) {
				5103	do {
				5104	--rescnt;
				5105	*res++ = fill;
				5106	} while (--width > len);
				5107	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5108	if (fill == ' ') {
				5109	if (sign)
				5110	*res++ = sign;
				5111	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5112	assert(pbuf[0] == '0');
				5113	assert(pbuf[1] == c);
				5114	res++ = pbuf++;
				5115	res++ = pbuf++;
				5116	}
				5117	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5118	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5119	res += len;
				5120	rescnt -= len;
				5121	while (--width >= len) {
				5122	--rescnt;
				5123	*res++ = ' ';
				5124	}
				5125	if (dict && (argidx < arglen) && c != '%') {
				5126	PyErr_SetString(PyExc_TypeError,
				5127	"not all arguments converted");
				5128	goto onError;
				5129	}
				5130	Py_XDECREF(temp);
				5131	} /* '%' */
				5132	} /* until end */
				5133	if (argidx < arglen && !dict) {
				5134	PyErr_SetString(PyExc_TypeError,
				5135	"not all arguments converted");
				5136	goto onError;
				5137	}
				5138
				5139	if (args_owned) {
				5140	Py_DECREF(args);
				5141	}
				5142	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5143	if (_PyUnicode_Resize(result, reslen - rescnt))
				5144	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5145	return (PyObject *)result;
				5146
				5147	onError:
				5148	Py_XDECREF(result);
				5149	Py_DECREF(uformat);
				5150	if (args_owned) {
				5151	Py_DECREF(args);
				5152	}
				5153	return NULL;
				5154	}
				5155
				5156	static PyBufferProcs unicode_as_buffer = {
				5157	(getreadbufferproc) unicode_buffer_getreadbuf,
				5158	(getwritebufferproc) unicode_buffer_getwritebuf,
				5159	(getsegcountproc) unicode_buffer_getsegcount,
				5160	(getcharbufferproc) unicode_buffer_getcharbuf,
				5161	};
				5162
				5163	PyTypeObject PyUnicode_Type = {
				5164	PyObject_HEAD_INIT(&PyType_Type)
				5165	0, /* ob_size */
				5166	"unicode", /* tp_name */
				5167	sizeof(PyUnicodeObject), /* tp_size */
				5168	0, /* tp_itemsize */
				5169	/* Slots */
				5170	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5171	0, /* tp_print */
				5172	(getattrfunc)unicode_getattr, /* tp_getattr */
				5173	0, /* tp_setattr */
				5174	(cmpfunc) unicode_compare, /* tp_compare */
				5175	(reprfunc) unicode_repr, /* tp_repr */
				5176	0, /* tp_as_number */
				5177	&unicode_as_sequence, /* tp_as_sequence */
				5178	0, /* tp_as_mapping */
				5179	(hashfunc) unicode_hash, /* tp_hash*/
				5180	0, /* tp_call*/
				5181	(reprfunc) unicode_str, /* tp_str */
				5182	(getattrofunc) NULL, /* tp_getattro */
				5183	(setattrofunc) NULL, /* tp_setattro */
				5184	&unicode_as_buffer, /* tp_as_buffer */
				5185	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5186	};
				5187
				5188	/* Initialize the Unicode implementation */
				5189
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5190	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5191	{
				5192	/* Doublecheck the configuration... */
				5193	if (sizeof(Py_UNICODE) != 2)
				5194	Py_FatalError("Unicode configuration error: "
				5195	"sizeof(Py_UNICODE) != 2 bytes");
				5196
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5197	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5198	unicode_freelist = NULL;
				5199	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5200	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5201	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5202	}
				5203
				5204	/* Finalize the Unicode implementation */
				5205
				5206	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5207	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5208	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5209	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5210
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5211	Py_XDECREF(unicode_empty);
				5212	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5213
				5214	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5215	PyUnicodeObject *v = u;
				5216	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5217	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5218	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5219	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5220	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5221	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5222	unicode_freelist = NULL;
				5223	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5224	}